Library¶
import pandas as pd
import numpy as np
import PreProcessingText as ppt
from collections import Counter, defaultdict
import seaborn as sns
from wordcloud import WordCloud
import networkx as nx
import matplotlib.pyplot as plt
import squarify
from transformers import pipeline
from tqdm import tqdm
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer, util
from bertopic.representation import MaximalMarginalRelevance, KeyBERTInspired, PartOfSpeech
from sklearn.feature_extraction.text import CountVectorizer
from keybert import KeyBERT
from umap import UMAP
import hdbscan
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
import csv
from bertopic.vectorizers import ClassTfidfTransformer
from sklearn.cluster import KMeans
from scipy.spatial import distance
from scipy.cluster import hierarchy as sch
from sklearn.cluster import AgglomerativeClustering
from matplotlib.colors import ListedColormap
from gensim.models.coherencemodel import CoherenceModel
from gensim import corpora
4° Approach: BERTopic¶
Baseline Summary¶
Clustering Approach¶
- Parameter Setting: A high parameter was set for HDBSCAN to ensure well-defined clusters that occupy a significant percentage of the total dataset. This baseline is intended to feed machine learning algorithms for prediction purposes.
Initial Clustering Results¶
Clusters Retrieved: 7 representative clusters were identified:
- Drug sales
- Bitcoin
- Scammers and seller reviews
- Marketplace advertising
- Purchase reviews
- Drug purchases
- Orders
Outliers: Initially, 34k outliers were found out of a total of 66k records.
Performance Metrics:
- Silhouette Score: 0.64
- Davies-Bouldin Score: 0.6
Outlier Reduction¶
Cosine Measure on Embeddings: By applying a cosine similarity measure with a 0.53 threshold, the number of outliers was reduced from 34k to 27k, reintroducing about 7k records.
Updated Performance Metrics:
- Silhouette Score: 0.51
- Davies-Bouldin Score: 0.8
Trade-off Analysis¶
Outlier Reintroduction: Reintroducing the outliers found a balance that prevented significant cluster degradation while keeping clusters well separated and defined, as evidenced by the graphs.
Cluster Distribution: The updated clusters are well-distributed:
- Maximum cluster size: 23% of the total dataset
- Minimum cluster size: 7% of the total dataset
- This distribution avoids large excursions.
Data Loss and Potential Adjustments¶
Data Loss: Approximately 40% of the initial dataset was lost.
Potential Correction: This data loss can potentially be mitigated by lowering the cosine similarity threshold between embeddings.
df = pd.read_csv('cleaned_data_name_thread.csv')
df = df.dropna(subset=['name_thread'])
df = df.drop_duplicates(subset=['name_thread'], keep='first')
df.shape[0]
66735
model = SentenceTransformer('distiluse-base-multilingual-cased-v1')
tc1 = ppt.TextClustering(df, 'name_thread')
tc1.encode_corpus(model, batch_size=64, to_tensor=False)
len(tc1.corpus), len(tc1.corpus_embeddings)
seed_topic_list = [[
'tor site', 'drug', 'cocaine', 'ketamine', 'weed', 'trafficking', 'scammer', 'market', 'vendor', 'bitcoin',
'mdma', 'coke', 'lsd', 'heroine', 'xanax', 'tor node', 'tor site', 'gun', 'weapon', 'hacking'
]]
zeroshot_topic_list = [pd.read_csv('../../../intent_crime.csv')['intent'].tolist()]
representation_model = MaximalMarginalRelevance(diversity=0.3)
vectorizer_model = CountVectorizer(stop_words="english")
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)
umap_model = UMAP(n_neighbors=15, n_components=10, min_dist=0.0, metric='cosine', random_state=42)
hdbscan_model = hdbscan.HDBSCAN(min_cluster_size=1200, metric='euclidean', cluster_selection_method='eom', prediction_data=True)
topic_model = BERTopic(
language='multilingual',
top_n_words=10,
n_gram_range=(1, 2),
umap_model=umap_model,
hdbscan_model=hdbscan_model,
seed_topic_list=seed_topic_list,
vectorizer_model=vectorizer_model,
ctfidf_model=ctfidf_model,
representation_model=representation_model,
zeroshot_topic_list=zeroshot_topic_list,
zeroshot_min_similarity=.05,
verbose=True
)
topics, probs = topic_model.fit_transform(tc1.corpus, tc1.corpus_embeddings)
print(topic_model.get_topic_info())
for topic_id in set(topics):
print(f"Topic {topic_id}:")
print(topic_model.get_topic(topic_id))
Topic Count Name \
0 -1 34449 -1_new_free_ticket_help
1 0 7495 0_weed_xanax_cocaine_coke
2 1 6093 1_market_dream_empire_nightmare
3 2 5034 2_vendor_scammer_scam_scamming
4 3 4087 3_review_vendor_feedback_mdma
5 4 4003 4_mdma_lsd_shit_whats
6 5 2402 5_order_package_delivery_shipping
7 6 1966 6_bitcoin_card_wallet_credit
Representation \
0 [new, free, ticket, help, update, account, mdm...
1 [weed, xanax, cocaine, coke, ketamine, mg, can...
2 [market, dream, empire, nightmare, vendor, wal...
3 [vendor, scammer, scam, scamming, exit, scamme...
4 [review, vendor, feedback, mdma, mg, sample, r...
5 [mdma, lsd, shit, whats, fuck, gone, got, guy,...
6 [order, package, delivery, shipping, tracking,...
7 [bitcoin, card, wallet, credit, coin, carding,...
Representative_Docs
0 [canadianflavor weed shatter cbd edible hash c...
1 [high quality weed thc product europe, new xan...
2 [next market, dream market vendor rstclass nig...
3 [looking good reliable vendor sell ounce, vend...
4 [empire vendor cocaine review, first ever revi...
5 [hey ro im gon na pull pk, life wonderful life...
6 [order accepted day still hasnt marked shipped...
7 [credit cards paypal prepaid card find, got cc...
Topic 0:
[('weed', 0.5972313505812425), ('xanax', 0.5664832282989213), ('cocaine', 0.5350787342936356), ('coke', 0.4710111701375004), ('ketamine', 0.46985128023380035), ('mg', 0.46256209204548415), ('cannabis', 0.41853925594172725), ('drug', 0.4053330171594432), ('pill', 0.3907822559981816), ('quality', 0.38621568363790615)]
Topic 1:
[('market', 0.892430998800942), ('dream', 0.6865843677324943), ('empire', 0.6830028029033173), ('nightmare', 0.5681939396872522), ('vendor', 0.34305231363817884), ('wall', 0.3245499595042113), ('marketplace', 0.319921898437173), ('scam', 0.2961241301762431), ('exit', 0.2960733863924834), ('link', 0.2915460778160393)]
Topic 2:
[('vendor', 0.6950361459297074), ('scammer', 0.6725026815231682), ('scam', 0.4980623980369779), ('scamming', 0.46575246018365657), ('exit', 0.44160475610894967), ('scammed', 0.40051759892624533), ('looking', 0.37884048200047027), ('warning', 0.37715463753082534), ('reliable', 0.37144259341974245), ('buyer', 0.3708904841304073)]
Topic 3:
[('review', 1.002255217202406), ('vendor', 0.5076272530565451), ('feedback', 0.4049037794348937), ('mdma', 0.381329954044546), ('mg', 0.37619091451980585), ('sample', 0.3754397070467268), ('reviews', 0.3504300951320543), ('lsd', 0.3465899767001684), ('opinion', 0.3303160657068881), ('xanax', 0.33022254366369147)]
Topic 4:
[('mdma', 0.38275973612659386), ('lsd', 0.3779572278615291), ('shit', 0.35340590919386444), ('whats', 0.34834774258692336), ('fuck', 0.3264035078860319), ('gone', 0.31797094824590016), ('got', 0.3167851762249627), ('guy', 0.3153758862961693), ('dead', 0.31361936874635366), ('going', 0.3042237209259171)]
Topic 5:
[('order', 0.9350712100343167), ('package', 0.6655706541276237), ('delivery', 0.562721266995139), ('shipping', 0.527231820138037), ('tracking', 0.5122872117651205), ('shipped', 0.48839280205239965), ('ordering', 0.4784769909883374), ('cancelled', 0.47119974969542505), ('pack', 0.4566507281813944), ('delivered', 0.45351148583756845)]
Topic 6:
[('bitcoin', 0.8235475804294793), ('card', 0.7734286502423073), ('wallet', 0.6772588642347616), ('credit', 0.6731588060336892), ('coin', 0.5703668040987371), ('carding', 0.5529443276986676), ('btc', 0.5121844608207589), ('cash', 0.5037356917020909), ('debit', 0.500260454896595), ('coinbase', 0.49454000630077194)]
Topic -1:
[('new', 0.28398750337326484), ('free', 0.2771677713524054), ('ticket', 0.2699448449851029), ('help', 0.2697705189262906), ('update', 0.2675394807401724), ('account', 0.26547262677161937), ('mdma', 0.2638718211547908), ('vendor', 0.2588459510247759), ('dispute', 0.25440435619535773), ('need', 0.2488688355528112)]
umap_embeddings = topic_model.umap_model.fit_transform(tc1.corpus_embeddings)
indices = [index for index, topic in enumerate(topics) if topic != -1]
X=umap_embeddings[np.array(indices)]
labels = [topic for index, topic in enumerate(topics) if topic != -1]
silhouette_scores = silhouette_score(X, labels)
print(f"silhouette_score: {silhouette_scores}")
print(f"Davies_bouldin_score: {davies_bouldin_score(X, labels)}")
silhouette_score: 0.6388389468193054 Davies_bouldin_score: 0.5523262827209047
best_indices = np.argsort(silhouette_scores)[-10:]
best_umap_embeddings = umap_embeddings[best_indices]
plt.figure(figsize=(10, 5))
plt.scatter(X[:, 0], X[:, 1], c=labels, cmap='viridis', s=5)
plt.gca().set_aspect('equal', 'datalim')
plt.colorbar()
plt.title('UMAP projection of the topics with highest silhouette scores', fontsize=24)
plt.show()
vectorizer_model = CountVectorizer(stop_words="english", ngram_range=(1, 5))
topic_model.update_topics(tc1.corpus, vectorizer_model=vectorizer_model)
topic_model.get_topic_freq()
| Topic | Count | |
|---|---|---|
| 0 | -1 | 34449 |
| 5 | 0 | 7495 |
| 2 | 1 | 6093 |
| 1 | 2 | 5034 |
| 7 | 3 | 4087 |
| 6 | 4 | 4003 |
| 3 | 5 | 2402 |
| 4 | 6 | 1966 |
print(topic_model.get_topic_info())
for topic_id in set(topics):
print(f"Topic {topic_id}:")
print(topic_model.get_topic(topic_id))
Topic Count Name \
0 -1 34449 -1_vendor_new_free_help
1 0 7495 0_weed_vendor_xanax_mg
2 1 6093 1_market_empire_dream_vendor
3 2 5034 2_vendor_scammer_scam_looking
4 3 4087 3_review_vendor review_vendor_review vendor
5 4 4003 4_mdma_lsd_good_got
6 5 2402 5_order_package_shipping_delivery
7 6 1966 6_card_bitcoin_wallet_credit
Representation \
0 [vendor, new, free, help, best, account, uk, u...
1 [weed, vendor, xanax, mg, cocaine, uk, best, c...
2 [market, empire, dream, vendor, nightmare, dre...
3 [vendor, scammer, scam, looking, scamming, exi...
4 [review, vendor review, vendor, review vendor,...
5 [mdma, lsd, good, got, shit, whats, guy, fuck,...
6 [order, package, shipping, delivery, vendor, p...
7 [card, bitcoin, wallet, credit, btc, carding, ...
Representative_Docs
0 [canadianflavor weed shatter cbd edible hash c...
1 [high quality weed thc product europe, new xan...
2 [next market, dream market vendor rstclass nig...
3 [looking good reliable vendor sell ounce, vend...
4 [empire vendor cocaine review, first ever revi...
5 [hey ro im gon na pull pk, life wonderful life...
6 [order accepted day still hasnt marked shipped...
7 [credit cards paypal prepaid card find, got cc...
Topic 0:
[('weed', 0.02425497350614531), ('vendor', 0.021978341010015688), ('xanax', 0.02077949072716719), ('mg', 0.01948517638840499), ('cocaine', 0.018417804414484252), ('uk', 0.015046793957699879), ('best', 0.013425752943917355), ('coke', 0.012717130457267087), ('ketamine', 0.01175969464362258), ('cannabis', 0.010948216683877144)]
Topic 1:
[('market', 0.09008978566905657), ('empire', 0.055274112551010335), ('dream', 0.04917325935832957), ('vendor', 0.024276714575283735), ('nightmare', 0.023605168431774765), ('dream market', 0.016025449931173885), ('empire market', 0.014646720705699409), ('new', 0.009033909010090109), ('nightmare market', 0.008867402221856543), ('scam', 0.006303868464254871)]
Topic 2:
[('vendor', 0.09965429794348642), ('scammer', 0.025788920958809015), ('scam', 0.017833603310448354), ('looking', 0.01337570071081538), ('scamming', 0.012208815488636926), ('exit', 0.011806364340026236), ('scammed', 0.008689720115543394), ('uk', 0.008678133768927804), ('good', 0.008493482524539575), ('warning', 0.008418582129949287)]
Topic 3:
[('review', 0.1428141634073404), ('vendor review', 0.058876246025626515), ('vendor', 0.05315846344525214), ('review vendor', 0.021049951157661017), ('review vendor review', 0.017406474951027713), ('review review', 0.015138695407876355), ('mg', 0.012888546716744416), ('mdma', 0.011146461993445255), ('sample', 0.010133356066428198), ('dream', 0.009783289767907996)]
Topic 4:
[('mdma', 0.011231558108969678), ('lsd', 0.009238251834183116), ('good', 0.007359917621616781), ('got', 0.006638868206622288), ('shit', 0.0065802885463340675), ('whats', 0.006051630264178851), ('guy', 0.005697866126116449), ('fuck', 0.005394916465354471), ('going', 0.005375411718474036), ('wsm', 0.0052967375805114646)]
Topic 5:
[('order', 0.09533424569336707), ('package', 0.025076372096897597), ('shipping', 0.02284913659637588), ('delivery', 0.018139605364174704), ('vendor', 0.014195026757439324), ('pack', 0.014024930561711633), ('tracking', 0.012976075064416448), ('shipped', 0.012741042718045418), ('ordering', 0.01153929794529684), ('time', 0.01087192180365464)]
Topic 6:
[('card', 0.04045581193563761), ('bitcoin', 0.03526436871145481), ('wallet', 0.02671909128748556), ('credit', 0.02286661027552805), ('btc', 0.0196385675748142), ('carding', 0.018970779081355412), ('coin', 0.016677548495845462), ('credit card', 0.014601870612078016), ('cash', 0.012420616388040553), ('bank', 0.010979756425111214)]
Topic -1:
[('vendor', 0.013820616851140987), ('new', 0.009152016420677532), ('free', 0.006913858221511509), ('help', 0.006453408973195096), ('best', 0.0060032500179123234), ('account', 0.005801364375676093), ('uk', 0.005664162822486113), ('update', 0.005547486073465391), ('crosspost', 0.005503646525948444), ('need', 0.00541678801673178)]
topic_model.visualize_topics()
topic_model.visualize_heatmap()
topic_model.visualize_hierarchy()
reduced_embeddings = UMAP(n_neighbors=15, n_components=2,
min_dist=0.0, metric='cosine').fit_transform(tc1.corpus_embeddings)
topic_model.visualize_documents(tc1.corpus, reduced_embeddings=reduced_embeddings,
hide_document_hover=True, hide_annotations=True)
topic_model.visualize_barchart()
new_topics = topic_model.reduce_outliers(tc1.corpus, topics, strategy="embeddings", embeddings=tc1.corpus_embeddings, threshold=0.53)
topic_model.update_topics(tc1.corpus, topics=new_topics)
topic_model.get_topic_info()
| Topic | Count | Name | Representation | Representative_Docs | |
|---|---|---|---|---|---|
| 0 | -1 | 27613 | -1_anyone_new_help_free | [anyone, new, help, free, please, update, tick... | [canadianflavor weed shatter cbd edible hash c... |
| 1 | 0 | 8645 | 0_weed_xanax_vendor_cocaine | [weed, xanax, vendor, cocaine, mg, uk, coke, b... | [high quality weed thc product europe, new xan... |
| 2 | 1 | 6236 | 1_market_empire_dream_nightmare | [market, empire, dream, nightmare, vendor, dre... | [next market, dream market vendor rstclass nig... |
| 3 | 2 | 6907 | 2_vendor_scammer_scam_looking | [vendor, scammer, scam, looking, scamming, sal... | [looking good reliable vendor sell ounce, vend... |
| 4 | 3 | 4230 | 3_review_vendor review_vendor_review vendor | [review, vendor review, vendor, review vendor,... | [empire vendor cocaine review, first ever revi... |
| 5 | 4 | 6299 | 4_mdma_lsd_get_looking | [mdma, lsd, get, looking, wsm, good, btc, ques... | [hey ro im gon na pull pk, life wonderful life... |
| 6 | 5 | 2776 | 5_order_package_shipping_delivery | [order, package, shipping, delivery, pack, shi... | [order accepted day still hasnt marked shipped... |
| 7 | 6 | 2823 | 6_bitcoin_card_wallet_btc | [bitcoin, card, wallet, btc, bank, credit, car... | [credit cards paypal prepaid card find, got cc... |
topic_model.visualize_documents(tc1.corpus, reduced_embeddings=reduced_embeddings,
hide_document_hover=True, hide_annotations=True)
topic_model.visualize_hierarchy()
topic_model.visualize_topics()
topic_model.visualize_barchart()
umap_embeddings = topic_model.umap_model.fit_transform(tc1.corpus_embeddings)
indices = [index for index, topic in enumerate(new_topics) if topic != -1]
X=umap_embeddings[np.array(indices)]
labels = [topic for index, topic in enumerate(new_topics) if topic != -1]
silhouette_scores = silhouette_score(X, labels)
print(f"silhouette_score: {silhouette_scores}")
print(f"Davies_bouldin_score: {davies_bouldin_score(X, labels)}")
silhouette_score: 0.5083789229393005 Davies_bouldin_score: 0.7570962651091117
topic_words = topic_model.get_topics()
topics = [topic_words[i] for i in range(len(topic_words) - 1) if i != -1]
topn = 10
topic_list = []
for topic in topics:
topic_list.append([word for word, _ in topic[:topn]])
coherence_model = CoherenceModel(
topics=topic_list,
texts=[doc.split() for doc in tc1.corpus],
dictionary=corpora.Dictionary([doc.split() for doc in tc1.corpus]),
coherence='c_v'
)
print(f"Coherence Model: {coherence_model.get_coherence()}")
Coherence Model: 0.40058884901572617
df['name_thread'] = df['name_thread'].str.lower().dropna()
df.drop_duplicates(subset='name_thread', inplace=True)
df.dropna(subset=['name_thread'], inplace=True)
created_on = df['created_on'].tolist()
len(created_on)
topics_over_time = topic_model.topics_over_time(tc1.corpus, created_on,
global_tuning=True, evolution_tuning=True, nr_bins=100)
topic_model.visualize_topics_over_time(topics_over_time, width=1250, height=700)
indices = [index for index, topic in enumerate(new_topics) if topic != -1]
corpus_valid = [tc1.corpus[i] for i in indices]
created_on_valid = [created_on[i] for i in indices]
embeddings_valid = [tc1.corpus_embeddings[i] for i in indices]
topics_valid = [new_topics[i] for i in indices]
probs_valid = [probs[i] for i in indices]
results = pd.DataFrame({
'Document': corpus_valid,
'Embedding': embeddings_valid,
'Topic': topics_valid,
'Probability': probs_valid,
'Created_on': created_on_valid,
})
results_final = pd.merge(results, topic_model.get_topic_info(), on='Topic')
results_final['UMAP_embedding'] = list(X)
print(results_final.shape)
results_final.head()
(37916, 10)
| Document | Embedding | Topic | Probability | Created_on | Count | Name | Representation | Representative_Docs | UMAP_embedding | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | checks | [0.052164897, 0.029597273, -0.03666609, 0.0051... | 4 | 0.000000 | 2020-01-09 | 6299 | 4_mdma_lsd_get_looking | [mdma, lsd, get, looking, wsm, good, btc, ques... | [hey ro im gon na pull pk, life wonderful life... | [1.6488198, 9.914265, 1.442794, 2.8094368, -0.... |
| 1 | trusted vendor status | [0.02445144, -0.008732641, -0.0050215074, 0.01... | 2 | 0.944247 | 2020-01-09 | 6907 | 2_vendor_scammer_scam_looking | [vendor, scammer, scam, looking, scamming, sal... | [looking good reliable vendor sell ounce, vend... | [2.910516, 10.281041, 1.650234, 3.0320778, -0.... |
| 2 | empire exit scam iiflux user incomming | [0.02890829, 0.036081452, -0.027694924, -0.007... | 1 | 1.000000 | 2019-11-06 | 6236 | 1_market_empire_dream_nightmare | [market, empire, dream, nightmare, vendor, dre... | [next market, dream market vendor rstclass nig... | [1.5884036, 9.8587885, 3.3090453, 2.652358, 2.... |
| 3 | ecstasy vendor packs | [-0.022524439, 0.03949761, -0.023750877, 0.033... | 5 | 0.797741 | 2020-01-09 | 2776 | 5_order_package_shipping_delivery | [order, package, shipping, delivery, pack, shi... | [order accepted day still hasnt marked shipped... | [2.0245404, 10.517631, 2.3443217, 3.7595236, -... |
| 4 | opening bank account person fake id | [-0.029834118, 0.03354508, -0.012210185, -0.02... | 6 | 1.000000 | 2019-11-06 | 2823 | 6_bitcoin_card_wallet_btc | [bitcoin, card, wallet, btc, bank, credit, car... | [credit cards paypal prepaid card find, got cc... | [0.7278271, 9.884823, 1.8116106, 2.9336705, -0... |
topic_model.save("Models/topic_model_0.64SilNew", serialization='pickle')
results_final.to_parquet('ResultsBERTopic/BERTopic_nodefinedcluster_topics_15n_10com_1200cluster_0.64sil_renewout.parquet')
sns.histplot(results_final, x='Topic', discrete=True);
plt.pie(results_final.value_counts('Topic'), labels=results_final.value_counts('Topic').index, autopct='%1.1f%%');
500 min cluster size¶
seed_topic_list = [[
'tor site', 'drug', 'cocaine', 'ketamine', 'weed', 'trafficking', 'scammer', 'market', 'vendor', 'bitcoin',
'mdma', 'coke', 'lsd', 'heroine', 'xanax', 'tor node', 'tor site', 'gun', 'weapon', 'hacking'
]]
zeroshot_topic_list = [pd.read_csv('../../../intent_crime.csv')['intent'].tolist()]
representation_model = MaximalMarginalRelevance(diversity=0.3)
vectorizer_model = CountVectorizer(stop_words="english")
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)
umap_model = UMAP(n_neighbors=15, n_components=10, min_dist=0.0, metric='cosine', random_state=42)
hdbscan_model = hdbscan.HDBSCAN(min_cluster_size=500, metric='euclidean', cluster_selection_method='eom', prediction_data=True)
topic_model = BERTopic(
language='multilingual',
top_n_words=10,
n_gram_range=(1, 2),
umap_model=umap_model,
hdbscan_model=hdbscan_model,
seed_topic_list=seed_topic_list,
vectorizer_model=vectorizer_model,
ctfidf_model=ctfidf_model,
representation_model=representation_model,
zeroshot_topic_list=zeroshot_topic_list,
zeroshot_min_similarity=.05,
verbose=True
)
topics, probs = topic_model.fit_transform(tc1.corpus, tc1.corpus_embeddings)
topic_model.get_topic_info()
| Topic | Count | Name | Representation | Representative_Docs | |
|---|---|---|---|---|---|
| 0 | -1 | 28000 | -1_mdma_new_link_lsd | [mdma, new, link, lsd, help, free, vendor, nee... | [need high quality fake id check , big thanks ... |
| 1 | 0 | 4930 | 0_xanax_coke_cocaine_ketamine | [xanax, coke, cocaine, ketamine, mg, drug, pil... | [promo sale mg adderall ad xanax mg lsd mdma u... |
| 2 | 1 | 4469 | 1_bitcoin_card_bank_carding | [bitcoin, card, bank, carding, monero, wallet,... | [way cash bank log using btc, send bitcoin get... |
| 3 | 2 | 4227 | 2_dread_sub_lsd_shit | [dread, sub, lsd, shit, mdma, whats, guy, fuck... | [hey guy xangod man, let guy know dread host w... |
| 4 | 3 | 3702 | 3_market_dream_nightmare_dreammarket | [market, dream, nightmare, dreammarket, market... | [not order nightmare market, nightmare market ... |
| 5 | 4 | 3469 | 4_review_vendor_reviews_mg | [review, vendor, reviews, mg, vendymcvendface,... | [thclear ml purple kush vape cart review, vend... |
| 6 | 5 | 3410 | 5_order_package_pack_dispute | [order, package, pack, dispute, delivery, ship... | [package custom month love letter nothing, pac... |
| 7 | 6 | 2700 | 6_vendor_looking_seller_vendors | [vendor, looking, seller, vendors, buyer, lsd,... | [best vendor uk lsd, looking good vendor cc fu... |
| 8 | 7 | 1694 | 7_weed_cannabis_marijuana_hash | [weed, cannabis, marijuana, hash, quality, str... | [hash weed ship eu good vendor also usa, new i... |
| 9 | 8 | 1540 | 8_darknet_dark_web_sentenced | [darknet, dark, web, sentenced, drug, darkweb,... | [tacoma man sentenced four year dealing drugs ... |
| 10 | 9 | 1502 | 9_empire_dispute_deposit_empiremarket | [empire, dispute, deposit, empiremarket, scamm... | [empire next, give me empire, empire anyone else] |
| 11 | 10 | 1475 | 10_account_password_pgp_hacking | [account, password, pgp, hacking, hacked, secu... | [vendor enerygcontrolled hacked ca nt log pass... |
| 12 | 11 | 1314 | 11_tried_anybody_heard_ordered | [tried, anybody, heard, ordered, used, recentl... | [anybody heard pasitheas, anyone order recentl... |
| 13 | 12 | 1031 | 12_scammer_scam_exit_scamming | [scammer, scam, exit, scamming, warning, scamm... | [xangod scammer going exit scam proof, cottage... |
| 14 | 13 | 777 | 13_update_maintenance_updated_upgrade | [update, maintenance, updated, upgrade, vender... | [shipping update, update order, vendor update] |
| 15 | 14 | 681 | 14_ticket_support_deposit_month | [ticket, support, deposit, month, response, an... | [support ticket ticket, please help support ti... |
| 16 | 15 | 608 | 15_sample_samples_free_test | [sample, samples, free, test, testing, lab, te... | [xanax mg shipping free samples, new vendor fr... |
umap_embeddings = topic_model.umap_model.fit_transform(tc1.corpus_embeddings)
indices = [index for index, topic in enumerate(topics) if topic != -1]
X=umap_embeddings[np.array(indices)]
labels = [topic for index, topic in enumerate(topics) if topic != -1]
silhouette_scores = silhouette_score(X, labels)
print(f"silhouette_score: {silhouette_scores}")
print(f"Davies_bouldin_score: {davies_bouldin_score(X, labels)}")
silhouette_score: 0.5718363523483276 Davies_bouldin_score: 0.6211900149809264
best_indices = np.argsort(silhouette_scores)[-10:]
best_umap_embeddings = umap_embeddings[best_indices]
unique_labels = np.unique(labels)
cmap = plt.cm.magma
plt.figure(figsize=(10, 5))
scatter = plt.scatter(X[:, 1], X[:, 2], c=labels, cmap=cmap, s=5)
plt.gca().set_aspect('equal', 'datalim')
norm = plt.Normalize(vmin=min(labels), vmax=max(labels))
handles = [plt.Line2D([0], [0], marker='o', color=cmap(norm(label)), linestyle='', markersize=10) for label in unique_labels]
legend_labels = [f'Class {label}' for label in unique_labels]
plt.legend(handles, legend_labels, title="Classes")
plt.colorbar(scatter, ticks=range(len(unique_labels)))
plt.title('UMAP projection of the topics with highest silhouette scores', fontsize=24)
plt.show()
vectorizer_model = CountVectorizer(stop_words="english", ngram_range=(1, 5))
topic_model.update_topics(tc1.corpus, vectorizer_model=vectorizer_model)
topic_model.visualize_topics()
topic_model.visualize_heatmap()
topic_model.visualize_hierarchy()
reduced_embeddings = UMAP(n_neighbors=15, n_components=2,
min_dist=0.0, metric='cosine').fit_transform(tc1.corpus_embeddings)
topic_model.visualize_documents(tc1.corpus, reduced_embeddings=reduced_embeddings,
hide_document_hover=True, hide_annotations=True)
new_topics = topic_model.reduce_outliers(tc1.corpus, topics, strategy="embeddings", embeddings=tc1.corpus_embeddings, threshold=0.6)
topic_model.update_topics(tc1.corpus, topics=new_topics)
topic_model.get_topic_info()
| Topic | Count | Name | Representation | Representative_Docs | |
|---|---|---|---|---|---|
| 0 | -1 | 23928 | -1_new_vendor_help_uk | [new, vendor, help, uk, need, mdma, best, free... | [need high quality fake id check , big thanks ... |
| 1 | 0 | 5207 | 0_xanax_cocaine_mg_coke | [xanax, cocaine, mg, coke, ketamine, vendor, p... | [promo sale mg adderall ad xanax mg lsd mdma u... |
| 2 | 1 | 4512 | 1_bitcoin_card_bank_carding | [bitcoin, card, bank, carding, monero, wallet,... | [way cash bank log using btc, send bitcoin get... |
| 3 | 2 | 4944 | 2_dread_mdma_lsd_get | [dread, mdma, lsd, get, sub, shit, guy, lookin... | [hey guy xangod man, let guy know dread host w... |
| 4 | 3 | 3801 | 3_market_dream_nightmare_dream market | [market, dream, nightmare, dream market, vendo... | [not order nightmare market, nightmare market ... |
| 5 | 4 | 3706 | 4_review_vendor review_vendor_review vendor | [review, vendor review, vendor, review vendor,... | [thclear ml purple kush vape cart review, vend... |
| 6 | 5 | 3434 | 5_order_dispute_pack_package | [order, dispute, pack, package, shipping, deli... | [package custom month love letter nothing, pac... |
| 7 | 6 | 4123 | 6_vendor_vendor vendor_looking_best | [vendor, vendor vendor, looking, best, inquiry... | [best vendor uk lsd, looking good vendor cc fu... |
| 8 | 7 | 1848 | 7_weed_cannabis_uk_weed vendor | [weed, cannabis, uk, weed vendor, vendor, qual... | [hash weed ship eu good vendor also usa, new i... |
| 9 | 8 | 1557 | 8_darknet_dark_dark web_web | [darknet, dark, dark web, web, drug, sentenced... | [tacoma man sentenced four year dealing drugs ... |
| 10 | 9 | 1835 | 9_empire_empire market_market_empire empire | [empire, empire market, market, empire empire,... | [empire next, give me empire, empire anyone else] |
| 11 | 10 | 1542 | 10_account_pgp_password_vendor account | [account, pgp, password, vendor account, crypt... | [vendor enerygcontrolled hacked ca nt log pass... |
| 12 | 11 | 1394 | 11_anyone_has_has anyone_anybody | [anyone, has, has anyone, anybody, tried, anyo... | [anybody heard pasitheas, anyone order recentl... |
| 13 | 12 | 1398 | 12_scammer_scam_exit_scamming | [scammer, scam, exit, scamming, scammed, warni... | [xangod scammer going exit scam proof, cottage... |
| 14 | 13 | 826 | 13_update_maintenance_updated_update update | [update, maintenance, updated, update update, ... | [shipping update, update order, vendor update] |
| 15 | 14 | 682 | 14_ticket_support ticket_support_please | [ticket, support ticket, support, please, depo... | [support ticket ticket, please help support ti... |
| 16 | 15 | 792 | 15_sample_free_free sample_samples | [sample, free, free sample, samples, free samp... | [xanax mg shipping free samples, new vendor fr... |
topic_model.visualize_documents(tc1.corpus, reduced_embeddings=reduced_embeddings,
hide_document_hover=True, hide_annotations=True)
topic_model.visualize_topics()
topic_model.visualize_hierarchy()
topic_model.visualize_barchart(top_n_topics=16)
umap_embeddings = topic_model.umap_model.fit_transform(tc1.corpus_embeddings)
indices = [index for index, topic in enumerate(new_topics) if topic != -1]
X=umap_embeddings[np.array(indices)]
labels = [topic for index, topic in enumerate(new_topics) if topic != -1]
silhouette_scores = silhouette_score(X, labels)
print(f"silhouette_score: {silhouette_scores}")
print(f"Davies_bouldin_score: {davies_bouldin_score(X, labels)}")
silhouette_score: 0.49986162781715393 Davies_bouldin_score: 0.7193546666619981
topic_words = topic_model.get_topics()
topics = [topic_words[i] for i in range(len(topic_words) - 1) if i != -1]
topn = 10
topic_list = []
for topic in topics:
topic_list.append([word for word, _ in topic[:topn]])
coherence_model = CoherenceModel(
topics=topic_list,
texts=[doc.split() for doc in tc1.corpus],
dictionary=corpora.Dictionary([doc.split() for doc in tc1.corpus]),
coherence='c_v'
)
print(f"Coherence Model: {coherence_model.get_coherence()}")
Coherence Model: 0.4902822303421074
indices = [index for index, topic in enumerate(new_topics) if topic != -1]
corpus_valid = [tc1.corpus[i] for i in indices]
created_on_valid = [created_on[i] for i in indices]
embeddings_valid = [tc1.corpus_embeddings[i] for i in indices]
topics_valid = [new_topics[i] for i in indices]
probs_valid = [probs[i] for i in indices]
results = pd.DataFrame({
'Document': corpus_valid,
'Embedding': embeddings_valid,
'Topic': topics_valid,
'Probability': probs_valid,
'Created_on': created_on_valid,
})
results_final = pd.merge(results, topic_model.get_topic_info(), on='Topic')
results_final['UMAP_embedding'] = list(X)
print(results_final.shape)
results_final.head()
results_final.to_parquet('ResultsBERTopic/BERTopic_nodefinedcluster_topics_15n_10com_500cluster_0.54sil_renewout.parquet')
(41601, 10)
topics_over_time = topic_model.topics_over_time(tc1.corpus, created_on,
global_tuning=True, evolution_tuning=True, nr_bins=100)
topic_model.visualize_topics_over_time(topics_over_time, width=1250, height=700)
plt.pie(results_final.value_counts('Topic'), labels=results_final.value_counts('Topic').index, autopct='%1.1f%%');
sns.histplot(results_final, x='Topic', discrete=True);
topic_model.save("Models/topic_model_0.50Sil300", serialization='pickle')
400 all-MiniLM-L6-v2¶
df = pd.read_csv('cleaned_data_name_thread.csv')
df = df.dropna(subset=['name_thread'])
df = df.drop_duplicates(subset=['name_thread'], keep='first')
df.shape[0]
66735
model = SentenceTransformer('all-MiniLM-L6-v2')
tc1 = ppt.TextClustering(df, 'name_thread')
tc1.encode_corpus(model, batch_size=64, to_tensor=False)
mmr = MaximalMarginalRelevance(diversity=0.3)
kw = KeyBERTInspired()
vectorizer_model = CountVectorizer(stop_words="english")
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)
umap_model = UMAP(n_neighbors=15, n_components=10, min_dist=0.0, metric='cosine', random_state=42)
hdbscan_model = hdbscan.HDBSCAN(min_cluster_size=400, metric='euclidean', cluster_selection_method='eom', prediction_data=True)
topic_model = BERTopic(
top_n_words=10,
n_gram_range=(1, 2),
umap_model=umap_model,
hdbscan_model=hdbscan_model,
vectorizer_model=vectorizer_model,
ctfidf_model=ctfidf_model,
representation_model=[mmr, kw],
embedding_model=model,
verbose=True
)
topics, probs = topic_model.fit_transform(tc1.corpus, tc1.corpus_embeddings)
print(topic_model.get_topic_info())
for topic_id in set(topics):
print(f"Topic {topic_id}:")
print(topic_model.get_topic(topic_id))
Topic Count Name \
0 -1 30941 -1_customer_buy_sale_buyer
1 0 5117 0_vape_shatter_carts_cartridge
2 1 2643 1_login_password_logged_error
3 2 2579 2_coca_opium_cocain_cocacolacompany
4 3 2124 3_xanaxlabs_xanaxlife_xanax_xanaxusa
5 4 1938 4_postal_usps_delivery_postage
6 5 1842 5_darkweb_darknetlive_darknetmarkets_sentenced
7 6 1721 6_empire_empiremarket_empireteam_empiredealer
8 7 1631 7_mdma_mdmamaster_pill_ecstasydata
9 8 1601 8_giftcard_card_giftcards_mastercard
10 9 1502 9_vendor_vendorpro_vendors_vendorbbmc
11 10 1417 10_scamming_scammer_scam_scammers
12 11 1126 11_counterfeiting_passport_counterfeit_fakeid
13 12 1072 12_dreammarket_nightmaremarket_market_dreams
14 13 979 13_lsd_tab_tabs_shrooms
15 14 739 14_monero_coinbase_coin_coins
16 15 676 15_review_reviewing_reviews_reviewer
17 16 674 16_pickledrick_heard_theoutfit_muttznutz
18 17 669 17_market_markets_marketplace_marketing
19 18 626 18_crosspost_deposting_goingpostal_vendors
20 19 603 19_deposit_depositing_deposits_ticket
21 20 573 20_pgpkey_pgp_pgps_pg
22 21 535 21_mod_moderator_dispute_disputes
23 22 450 22_cryptonia_cryptoniausers_cryptonians_cryptn...
24 23 445 23_wsm_wsms_vendorcp_machinerymint
25 24 443 24_ketamine_ketamin_ketamineking_ketaminekings
26 25 434 25_ticket_ticketmaster_ticketw_tickets
27 26 429 26_meth_methbusters_methamphetamine_crystal
Representation \
0 [customer, buy, sale, buyer, service, message,...
1 [vape, shatter, carts, cartridge, ounce, marij...
2 [login, password, logged, error, problem, log,...
3 [coca, opium, cocain, cocacolacompany, coke, c...
4 [xanaxlabs, xanaxlife, xanax, xanaxusa, xanaxr...
5 [postal, usps, delivery, postage, mail, delive...
6 [darkweb, darknetlive, darknetmarkets, sentenc...
7 [empire, empiremarket, empireteam, empiredeale...
8 [mdma, mdmamaster, pill, ecstasydata, mdmaus, ...
9 [giftcard, card, giftcards, mastercard, cards,...
10 [vendor, vendorpro, vendors, vendorbbmc, vendo...
11 [scamming, scammer, scam, scammers, scammed, s...
12 [counterfeiting, passport, counterfeit, fakeid...
13 [dreammarket, nightmaremarket, market, dreams,...
14 [lsd, tab, tabs, shrooms, acid, blotter, blott...
15 [monero, coinbase, coin, coins, cryptocurrency...
16 [review, reviewing, reviews, reviewer, reviewe...
17 [pickledrick, heard, theoutfit, muttznutz, hou...
18 [market, markets, marketplace, marketing, nonm...
19 [crosspost, deposting, goingpostal, vendors, c...
20 [deposit, depositing, deposits, ticket, deposi...
21 [pgpkey, pgp, pgps, pg, pgc, gnupg, key, gpg, ...
22 [mod, moderator, dispute, disputes, disputers,...
23 [cryptonia, cryptoniausers, cryptonians, crypt...
24 [wsm, wsms, vendorcp, machinerymint, wowza, pa...
25 [ketamine, ketamin, ketamineking, ketamineking...
26 [ticket, ticketmaster, ticketw, tickets, suppo...
27 [meth, methbusters, methamphetamine, crystal, ...
Representative_Docs
0 [dutchdrugz updates promo active till market p...
1 [sale girl scout cookie carts strains oz lb us...
2 [hey really could use help advice thanks, erro...
3 [colombian coke brazil ship world wide promoti...
4 [adderall mg ir adderall mg xanax super sale, ...
5 [informed delivery showing package, usa canada...
6 [three student arrested dark web drug traffick...
7 [empire anyone else, empire market back, empir...
8 [sale xtc pill mg mda us ca, uk mdma pill vend...
9 [carding amazon gift card, gift card prepaid d...
10 [nmm giving vendor runaround lying acting shad...
11 [market exit scam next, scam alert ukdrugdeale...
12 [buy counterfeit money real fake document, buy...
13 [dream market still, dream market, eleven drea...
14 [lsd blotter tab ug top quality, point one fre...
15 [looking best safe way buy large amount bitcoi...
16 [needing send sample bar trusted reviewer woul...
17 [anybody heard theoutfit, anybody heard pickle...
18 [market anyone else, market, currently working...
19 [envoy want crosspost, could vendor crosspost,...
20 [missing deposit double deposit please help, a...
21 [pgp public key, market pgp key, find pgp key]
22 [moderator dispute day, moderator please help ...
23 [cryptonia market, market king samsara crypton...
24 [wsm vendor, wsm back, wsm down]
25 [ketamine us, get ketamine, ketamine anyone]
26 [help support ticket please, help support tick...
27 [crystal meth uk, crystal meth, crystal meth v...
Topic 0:
[('vape', 0.4513024), ('shatter', 0.4508166), ('carts', 0.42475972), ('cartridge', 0.4150574), ('ounce', 0.38511506), ('marijuana', 0.3761327), ('cannabis', 0.37473193), ('edibles', 0.36946523), ('weed', 0.35874215), ('cart', 0.3494926)]
Topic 1:
[('login', 0.6874596), ('password', 0.58739483), ('logged', 0.44535103), ('error', 0.39473626), ('problem', 0.38404456), ('log', 0.3703017), ('account', 0.36962464), ('help', 0.36578366), ('trouble', 0.3579351), ('session', 0.34920555)]
Topic 2:
[('coca', 0.5442445), ('opium', 0.5241908), ('cocain', 0.48566723), ('cocacolacompany', 0.47682497), ('coke', 0.4701375), ('cocainehcl', 0.4403491), ('cocaine', 0.43470532), ('heroinfactory', 0.43406424), ('colombian', 0.40406665), ('cokemaster', 0.39702898)]
Topic 3:
[('xanaxlabs', 0.68098766), ('xanaxlife', 0.6694618), ('xanax', 0.64481914), ('xanaxusa', 0.5943617), ('xanaxring', 0.5927005), ('xanaxdepot', 0.5860753), ('xanaxdaddy', 0.57530177), ('xanaxblotters', 0.5676911), ('alprazolam', 0.5388765), ('xanaxinc', 0.5038374)]
Topic 4:
[('postal', 0.5783647), ('usps', 0.5671008), ('delivery', 0.552514), ('postage', 0.5435632), ('mail', 0.4794371), ('deliver', 0.46840727), ('package', 0.4595977), ('shipment', 0.4503156), ('shipping', 0.44325382), ('fedex', 0.44258836)]
Topic 5:
[('darkweb', 0.5460649), ('darknetlive', 0.47999817), ('darknetmarkets', 0.46108282), ('sentenced', 0.4581046), ('darknetmarketsnoobs', 0.4534067), ('darknet', 0.45285586), ('darkbay', 0.45059866), ('darkfail', 0.44140962), ('darkdotfail', 0.42702472), ('darknetaustralia', 0.42165762)]
Topic 6:
[('empire', 0.8657665), ('empiremarket', 0.8325376), ('empireteam', 0.7658358), ('empiredealer', 0.73584473), ('empires', 0.7089321), ('imperial', 0.59743464), ('imperialroyalty', 0.533589), ('market', 0.39446667), ('scammer', 0.3011508), ('nightmare', 0.29797795)]
Topic 7:
[('mdma', 0.57491755), ('mdmamaster', 0.55362886), ('pill', 0.54554516), ('ecstasydata', 0.54158187), ('mdmaus', 0.536477), ('mdacanada', 0.49906433), ('mda', 0.47733676), ('md', 0.47456974), ('ecstasy', 0.46981525), ('mg', 0.45221412)]
Topic 8:
[('giftcard', 0.68464833), ('card', 0.6067195), ('giftcards', 0.60337466), ('mastercard', 0.5686253), ('cards', 0.5325688), ('carding', 0.5214343), ('debit', 0.500812), ('carded', 0.49536285), ('carder', 0.48081687), ('cardable', 0.45047107)]
Topic 9:
[('vendor', 0.6717965), ('vendorpro', 0.64170885), ('vendors', 0.63945156), ('vendorbbmc', 0.6131782), ('vendorshop', 0.5619679), ('supplier', 0.4961744), ('shop', 0.43687624), ('inventory', 0.38063982), ('dealer', 0.37658587), ('trusted', 0.35675985)]
Topic 10:
[('scamming', 0.67339057), ('scammer', 0.64245546), ('scam', 0.6315777), ('scammers', 0.60618246), ('scammed', 0.5859374), ('scams', 0.5844768), ('exit', 0.38286078), ('ukdrugdealer', 0.37872887), ('warning', 0.35860184), ('confirmed', 0.3483911)]
Topic 11:
[('counterfeiting', 0.5351553), ('passport', 0.49532643), ('counterfeit', 0.48550797), ('fakeid', 0.46835682), ('forgery', 0.46821818), ('passports', 0.46553856), ('certificate', 0.46403533), ('fakeids', 0.36332572), ('licenses', 0.3491515), ('citizenship', 0.33687454)]
Topic 12:
[('dreammarket', 0.840524), ('nightmaremarket', 0.7301478), ('market', 0.679103), ('dreams', 0.5537206), ('nightmare', 0.54951864), ('dream', 0.52395815), ('dreaming', 0.51259714), ('nightmares', 0.5112673), ('dreamweaver', 0.4622426), ('deals', 0.4392535)]
Topic 13:
[('lsd', 0.6597349), ('tab', 0.4486916), ('tabs', 0.42244914), ('shrooms', 0.40983063), ('acid', 0.37709463), ('blotter', 0.3619333), ('blotters', 0.34030285), ('microdose', 0.31792137), ('dmt', 0.30784056), ('samspade', 0.306018)]
Topic 14:
[('monero', 0.66440576), ('coinbase', 0.6017641), ('coin', 0.58206344), ('coins', 0.55229485), ('cryptocurrency', 0.54781383), ('crypto', 0.5190888), ('bitcoin', 0.49815544), ('btc', 0.4951193), ('cryptocurrencies', 0.49073264), ('bitcoins', 0.48276216)]
Topic 15:
[('review', 0.7554549), ('reviewing', 0.70764035), ('reviews', 0.67082256), ('reviewer', 0.6707778), ('reviewed', 0.66799235), ('vendor', 0.3507808), ('post', 0.3232708), ('sample', 0.3039448), ('journal', 0.28708428), ('dankservices', 0.2783244)]
Topic 16:
[('pickledrick', 0.49188858), ('heard', 0.45528996), ('theoutfit', 0.4499943), ('muttznutz', 0.40856874), ('houseofdank', 0.38270152), ('purepharm', 0.3821613), ('thecandymanuk', 0.38004813), ('ndduk', 0.3797817), ('uzak', 0.37892848), ('turk', 0.37287065)]
Topic 17:
[('market', 0.9246511), ('markets', 0.82856095), ('marketplace', 0.66924006), ('marketing', 0.64059925), ('nonmarket', 0.63226146), ('undermarket', 0.5758176), ('traderoute', 0.5252505), ('farmersmarket', 0.51230544), ('demand', 0.48939776), ('trade', 0.4373095)]
Topic 18:
[('crosspost', 0.8023433), ('deposting', 0.54462177), ('goingpostal', 0.4369921), ('vendors', 0.3397432), ('courier', 0.31433263), ('tarred', 0.30136013), ('expose', 0.28236645), ('shop', 0.26232204), ('buyers', 0.25981808), ('weareamsterdam', 0.25617945)]
Topic 19:
[('deposit', 0.5940467), ('depositing', 0.54835135), ('deposits', 0.4703769), ('ticket', 0.4124618), ('deposited', 0.37039375), ('transaction', 0.32960162), ('btc', 0.29055083), ('fund', 0.28815228), ('unconfirmed', 0.28022093), ('twice', 0.27061075)]
Topic 20:
[('pgpkey', 0.78953433), ('pgp', 0.64266664), ('pgps', 0.60433674), ('pg', 0.57204497), ('pgc', 0.5202303), ('gnupg', 0.49523085), ('key', 0.4912796), ('gpg', 0.45877883), ('keys', 0.42667422), ('pgplogin', 0.40541986)]
Topic 21:
[('mod', 0.6461178), ('moderator', 0.6455801), ('dispute', 0.63188905), ('disputes', 0.53940743), ('disputers', 0.5393207), ('mods', 0.5271941), ('complaint', 0.47743487), ('modderator', 0.43813834), ('consensus', 0.3737623), ('handled', 0.37211758)]
Topic 22:
[('cryptonia', 0.82683897), ('cryptoniausers', 0.7519192), ('cryptonians', 0.7422215), ('cryptnonia', 0.6530852), ('cryptoni', 0.6209998), ('cryptoice', 0.5572725), ('market', 0.5073216), ('samasara', 0.42220467), ('samsera', 0.42188087), ('samsara', 0.3912958)]
Topic 23:
[('wsm', 0.8689953), ('wsms', 0.6338644), ('vendorcp', 0.41763154), ('machinerymint', 0.36969972), ('wowza', 0.36484522), ('paymwn', 0.32914096), ('maintenance', 0.31149185), ('greennz', 0.3085622), ('bionik', 0.30364022), ('bioniks', 0.30257553)]
Topic 24:
[('ketamine', 0.9532861), ('ketamin', 0.86957943), ('ketamineking', 0.8578399), ('ketaminekings', 0.8378519), ('ketaminehouse', 0.8028732), ('ketamax', 0.69982356), ('ketaconnect', 0.527894), ('tiletamine', 0.5001087), ('pyrimethamine', 0.48265585), ('pharmaceutical', 0.43739906)]
Topic 25:
[('ticket', 0.7282917), ('ticketmaster', 0.6860643), ('ticketw', 0.65911514), ('tickets', 0.62922376), ('support', 0.51385075), ('concert', 0.37351736), ('help', 0.29014573), ('assist', 0.28098187), ('fix', 0.27553594), ('outstanding', 0.27276954)]
Topic 26:
[('meth', 0.7546984), ('methbusters', 0.71206135), ('methamphetamine', 0.6617794), ('crystal', 0.6237694), ('methamph', 0.6163767), ('methoxetamine', 0.6146395), ('methadone', 0.58694017), ('dmethamphetamine', 0.5264992), ('methaqualone', 0.49982086), ('amphetamine', 0.49571955)]
Topic -1:
[('customer', 0.44219303), ('buy', 0.42263174), ('sale', 0.38992852), ('buyer', 0.38299185), ('service', 0.38183293), ('message', 0.37282392), ('update', 0.37055105), ('price', 0.37036857), ('paypal', 0.35097662), ('legit', 0.34381357)]
topic_model.get_topic_info()
| Topic | Count | Name | Representation | Representative_Docs | |
|---|---|---|---|---|---|
| 0 | -1 | 30941 | -1_customer_buy_sale_buyer | [customer, buy, sale, buyer, service, message,... | [dutchdrugz updates promo active till market p... |
| 1 | 0 | 5117 | 0_vape_shatter_carts_cartridge | [vape, shatter, carts, cartridge, ounce, marij... | [sale girl scout cookie carts strains oz lb us... |
| 2 | 1 | 2643 | 1_login_password_logged_error | [login, password, logged, error, problem, log,... | [hey really could use help advice thanks, erro... |
| 3 | 2 | 2579 | 2_coca_opium_cocain_cocacolacompany | [coca, opium, cocain, cocacolacompany, coke, c... | [colombian coke brazil ship world wide promoti... |
| 4 | 3 | 2124 | 3_xanaxlabs_xanaxlife_xanax_xanaxusa | [xanaxlabs, xanaxlife, xanax, xanaxusa, xanaxr... | [adderall mg ir adderall mg xanax super sale, ... |
| 5 | 4 | 1938 | 4_postal_usps_delivery_postage | [postal, usps, delivery, postage, mail, delive... | [informed delivery showing package, usa canada... |
| 6 | 5 | 1842 | 5_darkweb_darknetlive_darknetmarkets_sentenced | [darkweb, darknetlive, darknetmarkets, sentenc... | [three student arrested dark web drug traffick... |
| 7 | 6 | 1721 | 6_empire_empiremarket_empireteam_empiredealer | [empire, empiremarket, empireteam, empiredeale... | [empire anyone else, empire market back, empir... |
| 8 | 7 | 1631 | 7_mdma_mdmamaster_pill_ecstasydata | [mdma, mdmamaster, pill, ecstasydata, mdmaus, ... | [sale xtc pill mg mda us ca, uk mdma pill vend... |
| 9 | 8 | 1601 | 8_giftcard_card_giftcards_mastercard | [giftcard, card, giftcards, mastercard, cards,... | [carding amazon gift card, gift card prepaid d... |
| 10 | 9 | 1502 | 9_vendor_vendorpro_vendors_vendorbbmc | [vendor, vendorpro, vendors, vendorbbmc, vendo... | [nmm giving vendor runaround lying acting shad... |
| 11 | 10 | 1417 | 10_scamming_scammer_scam_scammers | [scamming, scammer, scam, scammers, scammed, s... | [market exit scam next, scam alert ukdrugdeale... |
| 12 | 11 | 1126 | 11_counterfeiting_passport_counterfeit_fakeid | [counterfeiting, passport, counterfeit, fakeid... | [buy counterfeit money real fake document, buy... |
| 13 | 12 | 1072 | 12_dreammarket_nightmaremarket_market_dreams | [dreammarket, nightmaremarket, market, dreams,... | [dream market still, dream market, eleven drea... |
| 14 | 13 | 979 | 13_lsd_tab_tabs_shrooms | [lsd, tab, tabs, shrooms, acid, blotter, blott... | [lsd blotter tab ug top quality, point one fre... |
| 15 | 14 | 739 | 14_monero_coinbase_coin_coins | [monero, coinbase, coin, coins, cryptocurrency... | [looking best safe way buy large amount bitcoi... |
| 16 | 15 | 676 | 15_review_reviewing_reviews_reviewer | [review, reviewing, reviews, reviewer, reviewe... | [needing send sample bar trusted reviewer woul... |
| 17 | 16 | 674 | 16_pickledrick_heard_theoutfit_muttznutz | [pickledrick, heard, theoutfit, muttznutz, hou... | [anybody heard theoutfit, anybody heard pickle... |
| 18 | 17 | 669 | 17_market_markets_marketplace_marketing | [market, markets, marketplace, marketing, nonm... | [market anyone else, market, currently working... |
| 19 | 18 | 626 | 18_crosspost_deposting_goingpostal_vendors | [crosspost, deposting, goingpostal, vendors, c... | [envoy want crosspost, could vendor crosspost,... |
| 20 | 19 | 603 | 19_deposit_depositing_deposits_ticket | [deposit, depositing, deposits, ticket, deposi... | [missing deposit double deposit please help, a... |
| 21 | 20 | 573 | 20_pgpkey_pgp_pgps_pg | [pgpkey, pgp, pgps, pg, pgc, gnupg, key, gpg, ... | [pgp public key, market pgp key, find pgp key] |
| 22 | 21 | 535 | 21_mod_moderator_dispute_disputes | [mod, moderator, dispute, disputes, disputers,... | [moderator dispute day, moderator please help ... |
| 23 | 22 | 450 | 22_cryptonia_cryptoniausers_cryptonians_cryptn... | [cryptonia, cryptoniausers, cryptonians, crypt... | [cryptonia market, market king samsara crypton... |
| 24 | 23 | 445 | 23_wsm_wsms_vendorcp_machinerymint | [wsm, wsms, vendorcp, machinerymint, wowza, pa... | [wsm vendor, wsm back, wsm down] |
| 25 | 24 | 443 | 24_ketamine_ketamin_ketamineking_ketaminekings | [ketamine, ketamin, ketamineking, ketamineking... | [ketamine us, get ketamine, ketamine anyone] |
| 26 | 25 | 434 | 25_ticket_ticketmaster_ticketw_tickets | [ticket, ticketmaster, ticketw, tickets, suppo... | [help support ticket please, help support tick... |
| 27 | 26 | 429 | 26_meth_methbusters_methamphetamine_crystal | [meth, methbusters, methamphetamine, crystal, ... | [crystal meth uk, crystal meth, crystal meth v... |
umap_embeddings = topic_model.umap_model.fit_transform(tc1.corpus_embeddings)
indices = [index for index, topic in enumerate(topics) if topic != -1]
X=umap_embeddings[np.array(indices)]
labels = [topic for index, topic in enumerate(topics) if topic != -1]
silhouette_scores = silhouette_score(X, labels)
print(f"silhouette_score: {silhouette_scores}")
print(f"Davies_bouldin_score: {davies_bouldin_score(X, labels)}")
silhouette_score: 0.6434006690979004 Davies_bouldin_score: 0.4681034572960446
vectorizer_model = CountVectorizer(stop_words="english", ngram_range=(1, 5))
topic_model.update_topics(tc1.corpus, vectorizer_model=vectorizer_model)
topic_model.visualize_topics()
topic_model.visualize_heatmap()
topic_model.visualize_hierarchy()
reduced_embeddings = UMAP(n_neighbors=15, n_components=2,
min_dist=0.0, metric='cosine').fit_transform(tc1.corpus_embeddings)
topic_model.visualize_documents(tc1.corpus, reduced_embeddings=reduced_embeddings,
hide_document_hover=True, hide_annotations=True)
new_topics = topic_model.reduce_outliers(tc1.corpus, topics, strategy="embeddings", embeddings=tc1.corpus_embeddings, threshold=0.5)
topic_model.update_topics(tc1.corpus, topics=new_topics)
topic_model.get_topic_info()
2024-06-27 14:34:02,549 - BERTopic - WARNING: Using a custom list of topic assignments may lead to errors if topic reduction techniques are used afterwards. Make sure that manually assigning topics is the last step in the pipeline.Note that topic embeddings will also be created through weightedc-TF-IDF embeddings instead of centroid embeddings.
| Topic | Count | Name | Representation | Representative_Docs | |
|---|---|---|---|---|---|
| 0 | -1 | 27323 | -1_anyone_vendor_order_review | [anyone, vendor, order, review, new, get, acco... | [dutchdrugz updates promo active till market p... |
| 1 | 0 | 5137 | 0_weed_cannabis_cart_review | [weed, cannabis, cart, review, thc, vendor, oz... | [sale girl scout cookie carts strains oz lb us... |
| 2 | 1 | 2700 | 1_help_login_need_account | [help, login, need, account, sub, back, passwo... | [hey really could use help advice thanks, erro... |
| 3 | 2 | 2601 | 2_cocaine_coke_heroin_drug | [cocaine, coke, heroin, drug, vendor, uk, best... | [colombian coke brazil ship world wide promoti... |
| 4 | 3 | 2270 | 3_xanax_mg_adderall_alprazolam | [xanax, mg, adderall, alprazolam, bar, diazepa... | [adderall mg ir adderall mg xanax super sale, ... |
| 5 | 4 | 2031 | 4_order_shipping_package_delivery | [order, shipping, package, delivery, shipped, ... | [informed delivery showing package, usa canada... |
| 6 | 5 | 1861 | 5_darknet_dark_tor_web | [darknet, dark, tor, web, onion, dark web, dar... | [three student arrested dark web drug traffick... |
| 7 | 6 | 1826 | 6_empire_empire market_empire empire_market | [empire, empire market, empire empire, market,... | [empire anyone else, empire market back, empir... |
| 8 | 7 | 1653 | 7_mdma_pill_mda_xtc | [mdma, pill, mda, xtc, mdma vendor, mg, usa, p... | [sale xtc pill mg mda us ca, uk mdma pill vend... |
| 9 | 8 | 1628 | 8_card_carding_cc_credit | [card, carding, cc, credit, cvv, credit card, ... | [carding amazon gift card, gift card prepaid d... |
| 10 | 9 | 3010 | 9_vendor_vendor vendor_inquiry_vendor inquiry | [vendor, vendor vendor, inquiry, vendor inquir... | [nmm giving vendor runaround lying acting shad... |
| 11 | 10 | 1741 | 10_scam_scammer_exit_scamming | [scam, scammer, exit, scamming, scammed, exit ... | [market exit scam next, scam alert ukdrugdeale... |
| 12 | 11 | 1147 | 11_counterfeit_id_fake_passport | [counterfeit, id, fake, passport, fake id, not... | [buy counterfeit money real fake document, buy... |
| 13 | 12 | 1202 | 12_dream_nightmare_dream market_market | [dream, nightmare, dream market, market, night... | [dream market still, dream market, eleven drea... |
| 14 | 13 | 1009 | 13_lsd_ug_tab_lsd vendor | [lsd, ug, tab, lsd vendor, acid, free, lsd tab... | [lsd blotter tab ug top quality, point one fre... |
| 15 | 14 | 854 | 14_monero_btc_bitcoin_coin | [monero, btc, bitcoin, coin, crypto, wallet, b... | [looking best safe way buy large amount bitcoi... |
| 16 | 15 | 926 | 15_review_vendor review_vendor_review vendor | [review, vendor review, vendor, review vendor,... | [needing send sample bar trusted reviewer woul... |
| 17 | 16 | 681 | 16_heard_anyone_anyone heard_happened | [heard, anyone, anyone heard, happened, has, h... | [anybody heard theoutfit, anybody heard pickle... |
| 18 | 17 | 989 | 17_market_market market_new market_new | [market, market market, new market, new, apoll... | [market anyone else, market, currently working... |
| 19 | 18 | 764 | 18_crosspost_review crosspost_crosspost vendor... | [crosspost, review crosspost, crosspost vendor... | [envoy want crosspost, could vendor crosspost,... |
| 20 | 19 | 671 | 19_deposit_deposited_ticket_address | [deposit, deposited, ticket, address, double, ... | [missing deposit double deposit please help, a... |
| 21 | 20 | 596 | 20_pgp_key_pgp key_public | [pgp, key, pgp key, public, public pgp, messag... | [pgp public key, market pgp key, find pgp key] |
| 22 | 21 | 551 | 21_dispute_dispute dispute_mod_moderator | [dispute, dispute dispute, mod, moderator, ple... | [moderator dispute day, moderator please help ... |
| 23 | 22 | 480 | 22_cryptonia_samsara_samsara market_cryptonia ... | [cryptonia, samsara, samsara market, cryptonia... | [cryptonia market, market king samsara crypton... |
| 24 | 23 | 485 | 23_wsm_wsm wsm_wsm vendor_vendor wsm | [wsm, wsm wsm, wsm vendor, vendor wsm, vendor,... | [wsm vendor, wsm back, wsm down] |
| 25 | 24 | 468 | 24_ketamine_ketamine vendor_mdma ketamine_keta... | [ketamine, ketamine vendor, mdma ketamine, ket... | [ketamine us, get ketamine, ketamine anyone] |
| 26 | 25 | 458 | 25_ticket_support ticket_support_please | [ticket, support ticket, support, please, mont... | [help support ticket please, help support tick... |
| 27 | 26 | 467 | 26_meth_crystal meth_crystal_meth vendor | [meth, crystal meth, crystal, meth vendor, met... | [crystal meth uk, crystal meth, crystal meth v... |
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
zero_shot_topics = pd.read_csv('../../../intent_crime.csv')['intent'].tolist()
dict_zero_shots_25 = ppt.assign_labels_to_topics(classifier, topic_model, zero_shot_topics, len(set(new_topics) - {-1}), threshold=.25)
dict_zero_shots_2 = ppt.assign_labels_to_topics(classifier, topic_model, zero_shot_topics, len(set(new_topics) - {-1}), threshold=.2)
dict_zero_shots_17 = ppt.assign_labels_to_topics(classifier, topic_model, zero_shot_topics, len(set(new_topics) - {-1}), threshold=.17)
dict_zero_shots_15 = ppt.assign_labels_to_topics(classifier, topic_model, zero_shot_topics, len(set(new_topics) - {-1}), threshold=.15)
dict_zero_shots_2[18] = 'crosspost vendor'
dict_zero_shots_2[22] = 'samsara market'
dict_zero_shots_2[23] = 'wsm market'
pd.DataFrame(list(dict_zero_shots_25.items()), columns=['Topic', 'Labels']).to_csv('ZeroShotClassificationResults/all-MiniLM-L6-v2_400/zero_shot_025.csv', index=False)
pd.DataFrame(list(dict_zero_shots_2.items()), columns=['Topic', 'Labels']).to_csv('ZeroShotClassificationResults/all-MiniLM-L6-v2_400/zero_shot_020.csv', index=False)
pd.DataFrame(list(dict_zero_shots_17.items()), columns=['Topic', 'Labels']).to_csv('ZeroShotClassificationResults/all-MiniLM-L6-v2_400/zero_shot_017.csv', index=False)
pd.DataFrame(list(dict_zero_shots_15.items()), columns=['Topic', 'Labels']).to_csv('ZeroShotClassificationResults/all-MiniLM-L6-v2_400/zero_shot_015.csv', index=False)
topic_model.set_topic_labels(dict_zero_shots_2)
topic_model.visualize_documents(tc1.corpus, reduced_embeddings=reduced_embeddings,
hide_document_hover=True, hide_annotations=True, custom_labels=True)
topic_model.visualize_hierarchy(custom_labels=True)
topic_model.visualize_topics()
topic_model.visualize_barchart(top_n_topics=25, custom_labels=True, n_words=10)
umap_embeddings = topic_model.umap_model.fit_transform(tc1.corpus_embeddings)
indices = [index for index, topic in enumerate(new_topics) if topic != -1]
X=umap_embeddings[np.array(indices)]
labels = [topic for index, topic in enumerate(new_topics) if topic != -1]
silhouette_scores = silhouette_score(X, labels)
print(f"silhouette_score: {silhouette_scores}")
print(f"Davies_bouldin_score: {davies_bouldin_score(X, labels)}")
silhouette_score: 0.5175204277038574 Davies_bouldin_score: 0.7919422601150089
topic_words = topic_model.get_topics()
topics = [topic_words[i] for i in range(len(topic_words) - 1) if i != -1]
topn = 10
topic_list = []
for topic in topics:
topic_list.append([word for word, _ in topic[:topn]])
coherence_model = CoherenceModel(
topics=topic_list,
texts=[doc.split() for doc in tc1.corpus],
dictionary=corpora.Dictionary([doc.split() for doc in tc1.corpus]),
coherence='c_v'
)
print(f"Coherence Model: {coherence_model.get_coherence()}")
Coherence Model: 0.5751057167740472
df['name_thread'] = df['name_thread'].str.lower().dropna()
df.drop_duplicates(subset='name_thread', inplace=True)
df.dropna(subset=['name_thread'], inplace=True)
created_on = df['created_on'].tolist()
len(created_on)
topics_over_time = topic_model.topics_over_time(tc1.corpus, created_on,
global_tuning=True, evolution_tuning=True, nr_bins=100)
topic_model.visualize_topics_over_time(topics_over_time, top_n_topics=10, width=1250, height=700, custom_labels=True)
15it [00:24, 1.62s/it]
indices = [index for index, topic in enumerate(new_topics) if topic != -1]
corpus_valid = [tc1.corpus[i] for i in indices]
created_on_valid = [created_on[i] for i in indices]
embeddings_valid = [tc1.corpus_embeddings[i] for i in indices]
topics_valid = [new_topics[i] for i in indices]
probs_valid = [probs[i] for i in indices]
results = pd.DataFrame({
'Document': corpus_valid,
'Embedding': embeddings_valid,
'Topic': topics_valid,
'Probability': probs_valid,
'Created_on': created_on_valid,
})
results_final = pd.merge(results, topic_model.get_topic_info(), on='Topic')
results_final['UMAP_embedding'] = list(X)
print(results_final.shape)
results_final.head()
(38274, 11)
| Document | Embedding | Topic | Probability | Created_on | Count | Name | CustomName | Representation | Representative_Docs | UMAP_embedding | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | review empire vendor acidbern | [-0.07762138, -0.049061198, -0.046745114, -0.0... | 6 | 0.527385 | 2020-01-09 | 1826 | 6_empire_empire market_empire empire_market | empire market | [empire, empire market, empire empire, market,... | [empire anyone else, empire market back, empir... | [9.086779, 3.6718397, 8.9006195, -1.1745992, 1... |
| 1 | vendor shipping combine priority | [-0.027722627, -0.0031221025, 0.01195772, -0.0... | 4 | 0.962274 | 2019-11-06 | 2031 | 4_order_shipping_package_delivery | order | [order, shipping, package, delivery, shipped, ... | [informed delivery showing package, usa canada... | [9.679236, 2.7164314, 8.733615, 0.011899776, 8... |
| 2 | open ticket since may ticket | [0.055031013, -0.018210536, -0.0026789573, -0.... | 25 | 1.000000 | 2020-01-09 | 458 | 25_ticket_support ticket_support_please | ticket support - ask help | [ticket, support ticket, support, please, mont... | [help support ticket please, help support tick... | [9.901975, 5.2703958, 11.463735, 0.47217792, 8... |
| 3 | vendor inquiry destroid dream | [-0.023196185, 0.0573189, 0.028408512, -0.0222... | 9 | 0.000000 | 2019-11-06 | 3010 | 9_vendor_vendor vendor_inquiry_vendor inquiry | inquiry - vendor vendor - vendor | [vendor, vendor vendor, inquiry, vendor inquir... | [nmm giving vendor runaround lying acting shad... | [9.912251, 4.028657, 7.623224, -0.7158077, 9.2... |
| 4 | morrison saver stamps uk money maker easiest m... | [-0.020903945, 0.050762244, -0.041445963, 0.01... | 11 | 0.799023 | 2020-01-09 | 1147 | 11_counterfeit_id_fake_passport | counterfeit money - fake IDs | [counterfeit, id, fake, passport, fake id, not... | [buy counterfeit money real fake document, buy... | [9.859931, 3.1459394, 9.145497, -1.0489817, 9.... |
topic_model.save("Models/topic_model_all-MiniLM-L6-v2_400", serialization="pickle", save_ctfidf=True, save_embedding_model=model)
results_final.to_parquet('ResultsBERTopic/BERTopic_all-MiniLM-L6-v2_400.parquet')
200 all-MiniLM-L6-v2¶
mmr = MaximalMarginalRelevance(diversity=0.3)
kw = KeyBERTInspired()
vectorizer_model = CountVectorizer(stop_words="english")
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)
umap_model = UMAP(n_neighbors=15, n_components=10, min_dist=0.0, metric='cosine', random_state=42)
hdbscan_model = hdbscan.HDBSCAN(min_cluster_size=200, metric='euclidean', cluster_selection_method='eom', prediction_data=True)
topic_model = BERTopic(
top_n_words=10,
n_gram_range=(1, 2),
umap_model=umap_model,
hdbscan_model=hdbscan_model,
vectorizer_model=vectorizer_model,
ctfidf_model=ctfidf_model,
representation_model=[mmr, kw],
embedding_model=model,
verbose=True
)
topics, probs = topic_model.fit_transform(tc1.corpus, tc1.corpus_embeddings)
print(topic_model.get_topic_info())
for topic_id in set(topics):
print(f"Topic {topic_id}:")
print(topic_model.get_topic(topic_id))
topic_model.get_topic_info()
| Topic | Count | Name | Representation | Representative_Docs | |
|---|---|---|---|---|---|
| 0 | -1 | 23926 | -1_opiateconnect_heinekenexpress_buy_dmt | [opiateconnect, heinekenexpress, buy, dmt, sho... | [good source dmt dream market, tramadol mg mg ... |
| 1 | 0 | 5394 | 0_carts_vape_cart_cannabis | [carts, vape, cart, cannabis, marijuana, straw... | [mg thc gummies cherry raspberry strawberry fl... |
| 2 | 1 | 2651 | 1_delivery_shipment_postage_delivered | [delivery, shipment, postage, delivered, posta... | [package say delivered po box never got, mail ... |
| 3 | 2 | 2204 | 2_vendor_vendors_vendorbbmc_trusted | [vendor, vendors, vendorbbmc, trusted, supplie... | [im looking vendor named buths bhuts shipping ... |
| 4 | 3 | 1951 | 3_guy_post_community_sub | [guy, post, community, sub, idea, posting, nig... | [sup fam ya boy ng min anybody legit right, hi... |
| 5 | 4 | 1715 | 4_empiremarket_empire_empiredealer_empireteam | [empiremarket, empire, empiredealer, empiretea... | [empire vendor, give me empire, top empire ven... |
| 6 | 5 | 1694 | 5_dreammarket_market_markets_nightmaremarket | [dreammarket, market, markets, nightmaremarket... | [miss dream ca nt use wallstreet market, wall ... |
| 7 | 6 | 1550 | 6_scamming_scammer_scam_scammers | [scamming, scammer, scam, scammers, scams, sca... | [cottageindustry possibly exit scamming select... |
| 8 | 7 | 1411 | 7_darkweb_sentenced_darknetmarketsnoobs_darkne... | [darkweb, sentenced, darknetmarketsnoobs, dark... | [darkweb vendor happytimes sentenced five year... |
| 9 | 8 | 1390 | 8_coca_cocain_cocacolacompany_cocainehcl | [coca, cocain, cocacolacompany, cocainehcl, co... | [review high purity colombian coke brazil, ful... |
| 10 | 9 | 1240 | 9_xanaxlabs_xanaxlife_xanax_xanaxcartel | [xanaxlabs, xanaxlife, xanax, xanaxcartel, xan... | [frankie new vendor mg real alprazolam xanax b... |
| 11 | 10 | 1130 | 10_mdma_mdmaus_mda_mdmamaster | [mdma, mdmaus, mda, mdmamaster, mdmamphetamine... | [per gram high purity mda promotion active sel... |
| 12 | 11 | 997 | 11_lsd_shrooms_tab_acid | [lsd, shrooms, tab, acid, tabs, psychedelic, m... | [point one lsd blotters lsd tab void realm tea... |
| 13 | 12 | 911 | 12_det_dere_igjen_en | [det, dere, igjen, en, privnote, kan, esrar, s... | [lever fortsatt valhalla noen som har en invit... |
| 14 | 13 | 813 | 13_monero_moneroatms_wallet_coin | [monero, moneroatms, wallet, coin, bitcoin, cr... | [buying coin anonymously needed monero, noob n... |
| 15 | 14 | 799 | 14_mastercard_card_usacards_cards | [mastercard, card, usacards, cards, carding, p... | [buying prepaid debit card btc eu, online card... |
| 16 | 15 | 687 | 15_tor_torguard_torbox_vpn | [tor, torguard, torbox, vpn, torstreet, vpns, ... | [configure tor browser disable javascript, use... |
| 17 | 16 | 659 | 16_crosspost_crossposting_goingpostal_crosspdf | [crosspost, crossposting, goingpostal, crosspd... | [lesson learnd googleplex saga prolific bar de... |
| 18 | 17 | 658 | 17_review_reviews_reviewer_reviewing | [review, reviews, reviewer, reviewing, reviewe... | [xpost danknation vendor review sunaero multis... |
| 19 | 18 | 636 | 18_marketplace_coremarket_market_markets | [marketplace, coremarket, market, markets, non... | [none marketplace link working, currently work... |
| 20 | 19 | 584 | 19_moderator_mod_dispute_disputee | [moderator, mod, dispute, disputee, disputers,... | [mod admin help dispute, dispute moderator ple... |
| 21 | 20 | 569 | 20_pgpkey_pgp_pgps_pg | [pgpkey, pgp, pgps, pg, key, gnupg, gpg, keys,... | [pgp public key, market pgp key, pgp key] |
| 22 | 21 | 568 | 21_deposit_depositing_deposits_deposited | [deposit, depositing, deposits, deposited, add... | [btc deposit issue ticket, missing deposit dou... |
| 23 | 22 | 539 | 22_passport_passports_fakeid_certificate | [passport, passports, fakeid, certificate, for... | [photoshop documents fakeid photo id address p... |
| 24 | 23 | 478 | 23_cryptonia_cryptoniausers_cryptonians_cryptn... | [cryptonia, cryptoniausers, cryptonians, crypt... | [cryptonia market, market king samsara crypton... |
| 25 | 24 | 468 | 24_wsm_wkr_whita_terpwax | [wsm, wkr, whita, terpwax, whachu, wowza, gree... | [back me wsm, wsm back, wsm vendor] |
| 26 | 25 | 447 | 25_bunk_bar_bars_selaminy | [bunk, bar, bars, selaminy, thegeniusbar, bars... | [selaminy bar review, bunk pack selaminy, sela... |
| 27 | 26 | 444 | 26_meth_methbusters_methamphetamine_methamph | [meth, methbusters, methamphetamine, methamph,... | [looking crystal meth, crystal meth uk, crysta... |
| 28 | 27 | 443 | 27_ketamine_ketamin_ketamineking_ketaminekings | [ketamine, ketamin, ketamineking, ketamineking... | [ketamine uk vendor, review ketamine, ketamine... |
| 29 | 28 | 438 | 28_ticket_ticketmaster_ticketing_ticketw | [ticket, ticketmaster, ticketing, ticketw, tic... | [support ticket open month, support ticket tic... |
| 30 | 29 | 416 | 29_counterfeitmoney_counterfeit_counterfeits_c... | [counterfeitmoney, counterfeit, counterfeits, ... | [find best usd counterfeit note, best counterf... |
| 31 | 30 | 415 | 30_login_logins_password_authentication | [login, logins, password, authentication, mult... | [password login disabled, login problem fa err... |
| 32 | 31 | 409 | 31_ecstasy_ecstasydata_pill_pillsexpress | [ecstasy, ecstasydata, pill, pillsexpress, pil... | [best ecstasy pill, samsung mg ecstasy pills u... |
| 33 | 32 | 409 | 32_hacking_hacker_hackerforhire_hackers | [hacking, hacker, hackerforhire, hackers, hack... | [job btc hacking service needed, looking profe... |
| 34 | 33 | 401 | 33_adderall_adderalls_adderal_adderallz | [adderall, adderalls, adderal, adderallz, adde... | [back mg adderall ir straight pharmacy brand n... |
| 35 | 34 | 392 | 34_tails_tail_wallet_monero | [tails, tail, wallet, monero, electrum, electr... | [electrum tail personal monero wallet, tails e... |
| 36 | 35 | 376 | 35_mushrooms_mushroommafia_mushroom_mushroomchick | [mushrooms, mushroommafia, mushroom, mushroomc... | [mushcanada free sample grams psilocybe cubens... |
| 37 | 36 | 369 | 36_xmr_xmrs_btc_lfwxmr | [xmr, xmrs, btc, lfwxmr, xmrto, btcoin, xmrtop... | [xmr btc empire, btc xmr, xmr btc xmr xmr] |
| 38 | 37 | 349 | 37_drugmarket_drugpics_drugs_drugsource | [drugmarket, drugpics, drugs, drugsource, drug... | [energy control international use and abuse of... |
| 39 | 38 | 344 | 38_dread_dreade_dreaddit_dreaddits | [dread, dreade, dreaddit, dreaddits, dreadadve... | [new dread since dream, dread back, dread well] |
| 40 | 39 | 315 | 39_withdraw_withdrawling_withdrawing_withdrawled | [withdraw, withdrawling, withdrawing, withdraw... | [made withdraw btc, withdrawal working stuck p... |
| 41 | 40 | 311 | 40_escrow_escrows_payment_multisignature | [escrow, escrows, payment, multisignature, mar... | [escrow, much escrow, full escrow] |
| 42 | 41 | 302 | 41_heroin_opium_heroinreview_heroinfactory | [heroin, opium, heroinreview, heroinfactory, h... | [liquidgold afghan burmese heroin sale extende... |
| 43 | 42 | 300 | 42_oxycodone_oxycocodone_oxicodone_oxycodon | [oxycodone, oxycocodone, oxicodone, oxycodon, ... | [mg oxycodone instant release supeudol origina... |
| 44 | 43 | 285 | 43_dnm_dmn_dnms_dnmrelated | [dnm, dmn, dnms, dnmrelated, dm, dwm, dnmsuper... | [dnm avenger link, new dnm order, call dnm ven... |
| 45 | 44 | 266 | 44_paypal_paypalshow_paypals_transfers | [paypal, paypalshow, paypals, transfers, trans... | [looking legit website bank western union payp... |
| 46 | 45 | 262 | 45_ddos_ddosd_attacks_attack | [ddos, ddosd, attacks, attack, ddosed, attacke... | [new ddos attack, attack ddos, ddos attack] |
| 47 | 46 | 252 | 46_fraud_fraudsters_fraudster_frauding | [fraud, fraudsters, fraudster, frauding, fraud... | [new fraud vendor, fraud vendor, fraud] |
| 48 | 47 | 247 | 47_benzoblotters_benzobuddies_benzos_benzo | [benzoblotters, benzobuddies, benzos, benzo, b... | [czech republic worldwide discreetlab selling ... |
| 49 | 48 | 231 | 48_apollonmarket_apollon_market_apollo | [apollonmarket, apollon, market, apollo, myste... | [mysteryland apollon market big promotion deal... |
| 50 | 49 | 230 | 49_phishing_phising_phish_phishy | [phishing, phising, phish, phishy, phissing, p... | [phishing warning, phishing link, warning empi... |
| 51 | 50 | 217 | 50_opsec_opsexy_opec_opspec | [opsec, opsexy, opec, opspec, opsecaholic, net... | [dream opsec, opsec, opsec question] |
| 52 | 51 | 213 | 51_mirror_mirrors_reflection_links | [mirror, mirrors, reflection, links, url, link... | [mirror link working, anyone working mirror li... |
| 53 | 52 | 212 | 52_links_link_pm_works | [links, link, pm, works, need, url, send, work... | [please pm someone working link, someone pm wo... |
| 54 | 53 | 207 | 53_fentanyl_fentantyl_carfentanyl_carfentanil | [fentanyl, fentantyl, carfentanyl, carfentanil... | [furanyl fentanyl fentanyl analogue eu, lookin... |
| 55 | 54 | 203 | 54_cgmc_invitation_ggmc_invite | [cgmc, invitation, ggmc, invite, cmc, gcmc, co... | [need cgmc invite code, cgmc invite code, invi... |
| 56 | 55 | 202 | 55_cvv_cvvs_ccv_cvvbilling | [cvv, cvvs, ccv, cvvbilling, cmv, ccs, vcc, cc... | [looking trusted cc cvv vendor, uk cc cvv vend... |
umap_embeddings = topic_model.umap_model.fit_transform(tc1.corpus_embeddings)
indices = [index for index, topic in enumerate(topics) if topic != -1]
X=umap_embeddings[np.array(indices)]
labels = [topic for index, topic in enumerate(topics) if topic != -1]
silhouette_scores = silhouette_score(X, labels)
print(f"silhouette_score: {silhouette_scores}")
print(f"Davies_bouldin_score: {davies_bouldin_score(X, labels)}")
silhouette_score: 0.6660425662994385 Davies_bouldin_score: 0.3869296287979983
topic_model.reduce_topics(tc1.corpus, nr_topics='auto')
topics = topic_model.topics_
topic_model.get_topic_info()
| Topic | Count | Name | Representation | Representative_Docs | |
|---|---|---|---|---|---|
| 0 | -1 | 23926 | -1_heinekenexpress_dmt_opiateconnect_tramadol | [heinekenexpress, dmt, opiateconnect, tramadol... | [good source dmt dream market, need know start... |
| 1 | 0 | 11635 | 0_cannabis_sale_edibles_price | [cannabis, sale, edibles, price, weed, shippin... | [adderall mg pharma gram aaa indoor nugs ounce... |
| 2 | 1 | 4393 | 1_scamming_scammer_scam_scammed | [scamming, scammer, scam, scammed, scams, phis... | [sale customer scamming alert vendor, partysqu... |
| 3 | 2 | 2651 | 2_delivery_package_shipment_postage | [delivery, package, shipment, postage, shippin... | [package marked delivered never arrived, packa... |
| 4 | 3 | 1951 | 3_post_posting_advice_community | [post, posting, advice, community, newbie, que... | [sup fam ya boy ng min anybody legit right, hi... |
| 5 | 4 | 1715 | 4_empire_empiremarket_empiredealer_empireteam | [empire, empiremarket, empiredealer, empiretea... | [empire deposit support, empire now back, empi... |
| 6 | 5 | 1694 | 5_dreammarket_dream_dreams_dreaming | [dreammarket, dream, dreams, dreaming, nightma... | [new wall st use dream quick question, dream m... |
| 7 | 6 | 1411 | 6_darkweb_darkbay_darknetmarkets_sentenced | [darkweb, darkbay, darknetmarkets, sentenced, ... | [father son sentenced prison selling drugs dar... |
| 8 | 7 | 1130 | 7_mdma_mdmamaster_mdmaus_mda | [mdma, mdmamaster, mdmaus, mda, mdmamphetamine... | [best domestic mdma mda fast shipping tracked ... |
| 9 | 8 | 911 | 8_det_dere_je_nede | [det, dere, je, nede, noen, du, igjen, vous, s... | [hejlpe til finne ut av hva jeg har mottatt, z... |
| 10 | 9 | 813 | 9_monero_moneroatms_wallet_bitcoin | [monero, moneroatms, wallet, bitcoin, crypto, ... | [monero btc, noob need help buying bitcoin mon... |
| 11 | 10 | 799 | 10_mastercard_card_carder_carding | [mastercard, card, carder, carding, cards, car... | [credit score balance hq debit card fullz appl... |
| 12 | 11 | 687 | 11_tor_torguard_vpn_torbox | [tor, torguard, vpn, torbox, vpns, torshops, t... | [really safe using tor vpn, use vpn tor tails,... |
| 13 | 12 | 659 | 12_crosspost_goingpostal_posted_marketplace | [crosspost, goingpostal, posted, marketplace, ... | [someone posted witchman account crosspost, ma... |
| 14 | 13 | 658 | 13_review_reviewing_reviews_reviewed | [review, reviewing, reviews, reviewed, reviewf... | [review please, xpost danknation vendor review... |
| 15 | 14 | 636 | 14_coremarket_marketplace_markets_market | [coremarket, marketplace, markets, market, non... | [none marketplace link working, core marketpla... |
| 16 | 15 | 584 | 15_dispute_moderator_disputes_disputers | [dispute, moderator, disputes, disputers, mod,... | [moderator please help dispute, dispute modera... |
| 17 | 16 | 569 | 16_pgpkey_pgp_pgps_pg | [pgpkey, pgp, pgps, pg, key, gnupg, keys, gpg,... | [find pgp key, pgp key, vendor pgp key] |
| 18 | 17 | 568 | 17_deposit_depositing_deposits_deposited | [deposit, depositing, deposits, deposited, btc... | [generated deposit address deposited multiple ... |
| 19 | 18 | 539 | 18_passport_passports_fakeid_certificate | [passport, passports, fakeid, certificate, for... | [photoshop documents fakeid photo id address p... |
| 20 | 19 | 478 | 19_cryptonia_cryptonians_cryptoniausers_cryptn... | [cryptonia, cryptonians, cryptoniausers, crypt... | [cryptonia already, everyone move cryptonia ma... |
| 21 | 20 | 468 | 20_wsm_wsms_wkr_wxtra | [wsm, wsms, wkr, wxtra, whita, terpwax, whachu... | [back me wsm, wsm back, wsm vendor] |
| 22 | 21 | 447 | 21_bunk_bars_bar_barsbaby | [bunk, bars, bar, barsbaby, lonestarbars, theg... | [bunk pack selaminy, bunk bar, selaminy hulk b... |
| 23 | 22 | 443 | 22_ketamine_ketamineking_ketamin_ketaminekings | [ketamine, ketamineking, ketamin, ketamineking... | [review ketamine, ketamine review, ketamine us] |
| 24 | 23 | 438 | 23_ticket_ticketmaster_tickets_support | [ticket, ticketmaster, tickets, support, conce... | [support ticket support ticket, support ticket... |
| 25 | 24 | 416 | 24_counterfeit_counterfeitmoney_counterfeits_c... | [counterfeit, counterfeitmoney, counterfeits, ... | [counterfeit note, find best usd counterfeit n... |
| 26 | 25 | 415 | 25_login_logins_password_authentication | [login, logins, password, authentication, logg... | [password changed lost ca nt log, login proble... |
| 27 | 26 | 392 | 26_tails_tail_electrum_electrumtails | [tails, tail, electrum, electrumtails, electru... | [updated tail electrum issue setting gui moner... |
| 28 | 27 | 376 | 27_mushrooms_mushroommafia_mushroom_shrooms | [mushrooms, mushroommafia, mushroom, shrooms, ... | [mushcanada free sample grams psilocybe cubens... |
| 29 | 28 | 369 | 28_xmr_xmrs_lfwxmr_xmrto | [xmr, xmrs, lfwxmr, xmrto, btc, xmrtopy, xanxa... | [btc xmr, xmr btc, xmr btc xmr xmr] |
| 30 | 29 | 344 | 29_dread_dreaddit_dreaddits_dreadonion | [dread, dreaddit, dreaddits, dreadonion, dread... | [dread back, anything dread, dread well] |
| 31 | 30 | 315 | 30_withdraw_withdrawling_withdrawl_withdrawing | [withdraw, withdrawling, withdrawl, withdrawin... | [withdraw problem pending withdraw hour, withd... |
| 32 | 31 | 311 | 31_escrow_escrows_marketplace_payment | [escrow, escrows, marketplace, payment, commis... | [full escrow, escrow first, multisig escrow qu... |
| 33 | 32 | 285 | 32_dnm_dmn_dnms_dnmrelated | [dnm, dmn, dnms, dnmrelated, dm, dwm, dnmarket... | [new dnm first order question, dnm avenger lin... |
| 34 | 33 | 266 | 33_paypal_paypalshow_paypals_transfers | [paypal, paypalshow, paypals, transfers, trans... | [looking legit website bank western union payp... |
| 35 | 34 | 262 | 34_ddos_ddosd_attacks_attack | [ddos, ddosd, attacks, attack, ddosed, attacke... | [anything new nightmare ddos attack, ddos atta... |
| 36 | 35 | 252 | 35_fraud_fraudsters_fraudster_frauding | [fraud, fraudsters, fraudster, frauding, fraud... | [new fraud vendor, fraud vendor, fraud] |
| 37 | 36 | 247 | 36_benzoblotters_benzobuddies_benzos_benzo | [benzoblotters, benzobuddies, benzos, benzo, b... | [czech republic worldwide discreetlab selling ... |
| 38 | 37 | 231 | 37_apollonmarket_apollon_market_apollo | [apollonmarket, apollon, market, apollo, myste... | [mysteryland apollon market big promotion deal... |
| 39 | 38 | 217 | 38_opsec_opsexy_opspec_opec | [opsec, opsexy, opspec, opec, opsecaholic, net... | [dream opsec, opsec question, opsec] |
| 40 | 39 | 213 | 39_mirror_mirrors_empire_reflection | [mirror, mirrors, empire, reflection, working,... | [empire mirror working, anyone working mirror ... |
| 41 | 40 | 212 | 40_links_link_pm_works | [links, link, pm, works, need, url, working, s... | [working link please pm, please pm someone wor... |
| 42 | 41 | 203 | 41_cgmc_invitation_ggmc_invite | [cgmc, invitation, ggmc, invite, cmc, gcmc, co... | [need cgmc invite code, invite code cgmc, cgmc... |
umap_embeddings = topic_model.umap_model.fit_transform(tc1.corpus_embeddings)
indices = [index for index, topic in enumerate(topics) if topic != -1]
X=umap_embeddings[np.array(indices)]
labels = [topic for index, topic in enumerate(topics) if topic != -1]
silhouette_scores = silhouette_score(X, labels)
print(f"silhouette_score: {silhouette_scores}")
print(f"Davies_bouldin_score: {davies_bouldin_score(X, labels)}")
silhouette_score: 0.34653472900390625 Davies_bouldin_score: 0.7209094786047956
vectorizer_model = CountVectorizer(stop_words="english", ngram_range=(1, 5))
topic_model.update_topics(tc1.corpus, vectorizer_model=vectorizer_model)
topic_model.visualize_topics()
topic_model.visualize_hierarchy()
reduced_embeddings = UMAP(n_neighbors=15, n_components=2,
min_dist=0.0, metric='cosine').fit_transform(tc1.corpus_embeddings)
topic_model.visualize_documents(tc1.corpus, reduced_embeddings=reduced_embeddings,
hide_document_hover=True, hide_annotations=True)
new_topics = topic_model.reduce_outliers(tc1.corpus, topics, strategy="embeddings", embeddings=tc1.corpus_embeddings, threshold=0.6)
topic_model.update_topics(tc1.corpus, topics=new_topics)
topic_model.get_topic_info()
2024-06-28 14:20:51,371 - BERTopic - WARNING: Using a custom list of topic assignments may lead to errors if topic reduction techniques are used afterwards. Make sure that manually assigning topics is the last step in the pipeline.Note that topic embeddings will also be created through weightedc-TF-IDF embeddings instead of centroid embeddings.
| Topic | Count | Name | Representation | Representative_Docs | |
|---|---|---|---|---|---|
| 0 | -1 | 23556 | -1_vendor_anyone_review_new | [vendor, anyone, review, new, account, order, ... | [good source dmt dream market, need know start... |
| 1 | 0 | 11636 | 0_weed_xanax_lsd_review | [weed, xanax, lsd, review, cocaine, mg, vendor... | [adderall mg pharma gram aaa indoor nugs ounce... |
| 2 | 1 | 4422 | 1_vendor_scammer_scam_scamming | [vendor, scammer, scam, scamming, exit, phishi... | [sale customer scamming alert vendor, partysqu... |
| 3 | 2 | 2655 | 2_order_shipping_pack_package | [order, shipping, pack, package, delivery, shi... | [package marked delivered never arrived, packa... |
| 4 | 3 | 1952 | 3_help_guy_need_back | [help, guy, need, back, day, time, question, a... | [sup fam ya boy ng min anybody legit right, hi... |
| 5 | 4 | 1734 | 4_empire_empire market_empire empire_market | [empire, empire market, empire empire, market,... | [empire deposit support, empire now back, empi... |
| 6 | 5 | 1696 | 5_dream_dream market_nightmare_market | [dream, dream market, nightmare, market, walls... | [new wall st use dream quick question, dream m... |
| 7 | 6 | 1411 | 6_darknet_dark_web_dark web | [darknet, dark, web, dark web, darkfail, sente... | [father son sentenced prison selling drugs dar... |
| 8 | 7 | 1151 | 7_mdma_mdma vendor_mda_usa | [mdma, mdma vendor, mda, usa, sale, mdma revie... | [best domestic mdma mda fast shipping tracked ... |
| 9 | 8 | 911 | 8_anyone_heard_happened_de | [anyone, heard, happened, de, anyone heard, ha... | [hejlpe til finne ut av hva jeg har mottatt, z... |
| 10 | 9 | 824 | 9_monero_bitcoin_btc_coin | [monero, bitcoin, btc, coin, wallet, crypto, b... | [monero btc, noob need help buying bitcoin mon... |
| 11 | 10 | 816 | 10_carding_card_credit_credit card | [carding, card, credit, credit card, debit, pr... | [credit score balance hq debit card fullz appl... |
| 12 | 11 | 687 | 11_onion_tor_vpn_javascript | [onion, tor, vpn, javascript, browser, tor bro... | [really safe using tor vpn, use vpn tor tails,... |
| 13 | 12 | 683 | 12_crosspost_review crosspost_giveaway_review | [crosspost, review crosspost, giveaway, review... | [someone posted witchman account crosspost, ma... |
| 14 | 13 | 706 | 13_review_vendor review_review vendor_vendor | [review, vendor review, review vendor, vendor,... | [review please, xpost danknation vendor review... |
| 15 | 14 | 697 | 14_market_market market_new market_marketplace | [market, market market, new market, marketplac... | [none marketplace link working, core marketpla... |
| 16 | 15 | 585 | 15_dispute_moderator_mod_dispute dispute | [dispute, moderator, mod, dispute dispute, ple... | [moderator please help dispute, dispute modera... |
| 17 | 16 | 573 | 16_pgp_key_pgp key_public | [pgp, key, pgp key, public, public pgp, lost, ... | [find pgp key, pgp key, vendor pgp key] |
| 18 | 17 | 578 | 17_deposit_deposited_address_btc | [deposit, deposited, address, btc, btc deposit... | [generated deposit address deposited multiple ... |
| 19 | 18 | 540 | 18_id_fake_passport_fake id | [id, fake, passport, fake id, license, scan, d... | [photoshop documents fakeid photo id address p... |
| 20 | 19 | 482 | 19_cryptonia_samsara_samsara market_cryptonia ... | [cryptonia, samsara, samsara market, cryptonia... | [cryptonia already, everyone move cryptonia ma... |
| 21 | 20 | 485 | 20_wsm_wsm wsm_wsm vendor_vendor wsm | [wsm, wsm wsm, wsm vendor, vendor wsm, vendor,... | [back me wsm, wsm back, wsm vendor] |
| 22 | 21 | 449 | 21_bar_bunk_selaminy_bars | [bar, bunk, selaminy, bars, hulk, bunk bar, th... | [bunk pack selaminy, bunk bar, selaminy hulk b... |
| 23 | 22 | 445 | 22_ketamine_ketamine vendor_ketamine review_re... | [ketamine, ketamine vendor, ketamine review, r... | [review ketamine, ketamine review, ketamine us] |
| 24 | 23 | 440 | 23_ticket_support ticket_support_please | [ticket, support ticket, support, please, mont... | [support ticket support ticket, support ticket... |
| 25 | 24 | 430 | 24_counterfeit_euro_note_counterfeit euro | [counterfeit, euro, note, counterfeit euro, co... | [counterfeit note, find best usd counterfeit n... |
| 26 | 25 | 418 | 25_login_account_password_log | [login, account, password, log, fa, error, ca ... | [password changed lost ca nt log, login proble... |
| 27 | 26 | 393 | 26_tails_tail_electrum_wallet | [tails, tail, electrum, wallet, whonix, monero... | [updated tail electrum issue setting gui moner... |
| 28 | 27 | 377 | 27_mushroom_shrooms_mushrooms_magic | [mushroom, shrooms, mushrooms, magic, cubensis... | [mushcanada free sample grams psilocybe cubens... |
| 29 | 28 | 379 | 28_xmr_btc xmr_btc_xmrto | [xmr, btc xmr, btc, xmrto, xmr btc, xmr deposi... | [btc xmr, xmr btc, xmr btc xmr xmr] |
| 30 | 29 | 348 | 29_dread_dread dread_sub dread_new dread | [dread, dread dread, sub dread, new dread, sub... | [dread back, anything dread, dread well] |
| 31 | 30 | 325 | 30_withdraw_withdrawal_withdrawl_working | [withdraw, withdrawal, withdrawl, working, btc... | [withdraw problem pending withdraw hour, withd... |
| 32 | 31 | 320 | 31_escrow_multisig_full escrow_extend | [escrow, multisig, full escrow, extend, extend... | [full escrow, escrow first, multisig escrow qu... |
| 33 | 32 | 290 | 32_dnm_dnms_dn_dnstars | [dnm, dnms, dn, dnstars, dnmuk, avenger, dm, d... | [new dnm first order question, dnm avenger lin... |
| 34 | 33 | 271 | 33_paypal_transfer_paypal transfer_paypal account | [paypal, transfer, paypal transfer, paypal acc... | [looking legit website bank western union payp... |
| 35 | 34 | 264 | 34_ddos_ddos attack_attack_ddos ddos | [ddos, ddos attack, attack, ddos ddos, market,... | [anything new nightmare ddos attack, ddos atta... |
| 36 | 35 | 262 | 35_fraud_fraudsters_fraud vendor_loan fraud | [fraud, fraudsters, fraud vendor, loan fraud, ... | [new fraud vendor, fraud vendor, fraud] |
| 37 | 36 | 256 | 36_benzos_benzo_rc_benzo vendor | [benzos, benzo, rc, benzo vendor, rc benzos, r... | [czech republic worldwide discreetlab selling ... |
| 38 | 37 | 232 | 37_apollon_apollon market_market_mysteryland | [apollon, apollon market, market, mysteryland,... | [mysteryland apollon market big promotion deal... |
| 39 | 38 | 219 | 38_opsec_opsec question_opsec opsec_question | [opsec, opsec question, opsec opsec, question,... | [dream opsec, opsec question, opsec] |
| 40 | 39 | 215 | 39_mirror_working mirror_working_mirror link | [mirror, working mirror, working, mirror link,... | [empire mirror working, anyone working mirror ... |
| 41 | 40 | 213 | 40_link_working link_working_pm | [link, working link, working, pm, link please,... | [working link please pm, please pm someone wor... |
| 42 | 41 | 203 | 41_cgmc_invite_invite code_code | [cgmc, invite, invite code, code, cgmc invite,... | [need cgmc invite code, invite code cgmc, cgmc... |
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
zero_shot_topics = pd.read_csv('../../../intent_crime.csv')['intent'].tolist()
dict_zero_shots_25 = ppt.assign_labels_to_topics(classifier, topic_model, zero_shot_topics, len(set(new_topics) - {-1}), threshold=.25)
dict_zero_shots_2 = ppt.assign_labels_to_topics(classifier, topic_model, zero_shot_topics, len(set(new_topics) - {-1}), threshold=.2)
dict_zero_shots_25[1] = 'phishing - scamming'
dict_zero_shots_25[12] = 'crosspost vendor'
dict_zero_shots_25[19] = 'cryphtonia market'
dict_zero_shots_25[20] = 'wsm market'
dict_zero_shots_25[21] = 'bunk bar'
dict_zero_shots_25[31] = 'escrow service'
dict_zero_shots_25[39] = 'mirror link'
dict_zero_shots_25[40] = 'link'
dict_zero_shots_25[41] = 'cmgc - invite'
pd.DataFrame(list(dict_zero_shots_25.items()), columns=['Topic', 'Labels']).to_csv('ZeroShotClassificationResults/all-MiniLM-L6-v2_200/zero_shot_025.csv', index=False)
pd.DataFrame(list(dict_zero_shots_2.items()), columns=['Topic', 'Labels']).to_csv('ZeroShotClassificationResults/all-MiniLM-L6-v2_200/zero_shot_020.csv', index=False)
topic_model.set_topic_labels(dict_zero_shots_25)
topic_model.visualize_documents(tc1.corpus, reduced_embeddings=reduced_embeddings,
hide_document_hover=True, hide_annotations=True, custom_labels=True)
topic_model.visualize_barchart(top_n_topics=42, custom_labels=True, n_words=10)
topic_model.visualize_topics()
topic_model.visualize_hierarchy(custom_labels=True)
umap_embeddings = topic_model.umap_model.fit_transform(tc1.corpus_embeddings)
indices = [index for index, topic in enumerate(topics) if topic != -1]
X=umap_embeddings[np.array(indices)]
labels = [topic for index, topic in enumerate(topics) if topic != -1]
silhouette_scores = silhouette_score(X, labels)
print(f"silhouette_score: {silhouette_scores}")
print(f"Davies_bouldin_score: {davies_bouldin_score(X, labels)}")
silhouette_score: 0.34653472900390625 Davies_bouldin_score: 0.7209094786047956
topic_words = topic_model.get_topics()
topics = [topic_words[i] for i in range(len(topic_words) - 1) if i != -1]
topn = 10
topic_list = []
for topic in topics:
topic_list.append([word for word, _ in topic[:topn]])
coherence_model = CoherenceModel(
topics=topic_list,
texts=[doc.split() for doc in tc1.corpus],
dictionary=corpora.Dictionary([doc.split() for doc in tc1.corpus]),
coherence='c_v'
)
print(f"Coherence Model: {coherence_model.get_coherence()}")
Coherence Model: 0.5765415205607421
df['name_thread'] = df['name_thread'].str.lower().dropna()
df.drop_duplicates(subset='name_thread', inplace=True)
df.dropna(subset=['name_thread'], inplace=True)
created_on = df['created_on'].tolist()
len(created_on)
65529
topics_over_time = topic_model.topics_over_time(tc1.corpus, created_on,
global_tuning=True, evolution_tuning=True, nr_bins=100)
topic_model.visualize_topics_over_time(topics_over_time, top_n_topics=10, width=1250, height=700, custom_labels=True)
15it [00:25, 1.72s/it]
indices = [index for index, topic in enumerate(new_topics) if topic != -1]
corpus_valid = [tc1.corpus[i] for i in indices]
created_on_valid = [created_on[i] for i in indices]
embeddings_valid = [tc1.corpus_embeddings[i] for i in indices]
topics_valid = [new_topics[i] for i in indices]
probs_valid = [probs[i] for i in indices]
results = pd.DataFrame({
'Document': corpus_valid,
'Embedding': embeddings_valid,
'Topic': topics_valid,
'Probability': probs_valid,
'Created_on': created_on_valid,
})
results_final = pd.merge(results, topic_model.get_topic_info(), on='Topic')
print(results_final.shape)
results_final.head()
(41973, 10)
| Document | Embedding | Topic | Probability | Created_on | Count | Name | CustomName | Representation | Representative_Docs | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | retirement sale one last blowout mdma dry spee... | [-0.00200396, 0.060752388, 0.00081512495, -0.0... | 7 | 0.393620 | 2020-01-09 | 1151 | 7_mdma_mdma vendor_mda_usa | mdma - reviews vendor | [mdma, mdma vendor, mda, usa, sale, mdma revie... | [best domestic mdma mda fast shipping tracked ... |
| 1 | cash deposit | [-0.0044404618, 0.016640304, -0.035438363, 0.0... | 17 | 0.539291 | 2019-11-06 | 578 | 17_deposit_deposited_address_btc | deposit - address - deposited | [deposit, deposited, address, btc, btc deposit... | [generated deposit address deposited multiple ... |
| 2 | import meth contact tracking | [-0.05514505, -0.042183764, -0.060674116, -0.0... | 0 | 1.000000 | 2020-01-09 | 11636 | 0_weed_xanax_lsd_review | xanax - lsd - weed | [weed, xanax, lsd, review, cocaine, mg, vendor... | [adderall mg pharma gram aaa indoor nugs ounce... |
| 3 | please need working links | [0.013639548, -0.030973928, -0.05787297, 0.026... | 40 | 1.000000 | 2020-01-09 | 213 | 40_link_working link_working_pm | link | [link, working link, working, pm, link please,... | [working link please pm, please pm someone wor... |
| 4 | reliable dexedrine vendor | [-0.09150407, -0.024179617, 0.027147656, -0.06... | 0 | 0.404354 | 2020-01-09 | 11636 | 0_weed_xanax_lsd_review | xanax - lsd - weed | [weed, xanax, lsd, review, cocaine, mg, vendor... | [adderall mg pharma gram aaa indoor nugs ounce... |
plt.figure(figsize=(10, 5))
sns.countplot(results_final, x='Topic', orient='h');
topic_model.save("Models/topic_model_all-MiniLM-L6-v2_200", serialization="pickle", save_ctfidf=True, save_embedding_model=model)
2024-06-28 16:03:43,246 - BERTopic - WARNING: When you use `pickle` to save/load a BERTopic model,please make sure that the environments in which you saveand load the model are **exactly** the same. The version of BERTopic,its dependencies, and python need to remain the same.
topic_model.save("Models/topic_model_all-MiniLM-L6-v2_200_safetensors", serialization="safetensors", save_ctfidf=True, save_embedding_model=model)
results_final.to_parquet('ResultsBERTopic/BERTopic_all-MiniLM-L6-v2_200.parquet')
import nbconvert
!jupyter nbconvert --to html show_results.ipynb
20n 150 all-MiniLM-L6-v2¶
mmr = MaximalMarginalRelevance(diversity=0.3)
kw = KeyBERTInspired()
vectorizer_model = CountVectorizer(stop_words="english")
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)
umap_model = UMAP(n_neighbors=20, n_components=10, min_dist=0.0, metric='cosine', random_state=42)
hdbscan_model = hdbscan.HDBSCAN(min_cluster_size=150, metric='euclidean', cluster_selection_method='eom', prediction_data=True)
topic_model = BERTopic(
top_n_words=10,
n_gram_range=(1, 2),
umap_model=umap_model,
hdbscan_model=hdbscan_model,
vectorizer_model=vectorizer_model,
ctfidf_model=ctfidf_model,
representation_model=[mmr, kw],
embedding_model=model,
calculate_probabilities=True,
verbose=True
)
topics, probs = topic_model.fit_transform(tc1.corpus, tc1.corpus_embeddings)
2024-06-30 15:40:03,123 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm 2024-06-30 15:42:22,877 - BERTopic - Dimensionality - Completed ✓ 2024-06-30 15:42:22,905 - BERTopic - Cluster - Start clustering the reduced embeddings 2024-06-30 15:43:23,274 - BERTopic - Cluster - Completed ✓ 2024-06-30 15:43:23,475 - BERTopic - Representation - Extracting topics from clusters using representation models. 2024-06-30 15:44:03,645 - BERTopic - Representation - Completed ✓
vectorizer_model = CountVectorizer(stop_words="english", ngram_range=(1, 5))
topic_model.update_topics(tc1.corpus, vectorizer_model=vectorizer_model)
topic_model.get_topic_info()
| Topic | Count | Name | Representation | Representative_Docs | |
|---|---|---|---|---|---|
| 0 | -1 | 24995 | -1_vendor_review_market_new | [vendor, review, market, new, order, account, ... | [review vendor cdnven product china white synt... |
| 1 | 0 | 4998 | 0_weed_cannabis_cart_thc | [weed, cannabis, cart, thc, review, oz, hash, ... | [product vendor review ml lemon kush wax vape ... |
| 2 | 1 | 1948 | 1_help_need_guy_sub | [help, need, guy, sub, day, question, post, ad... | [hi guy update post, way make sub like old red... |
| 3 | 2 | 1845 | 2_order_shipping_package_pack | [order, shipping, package, pack, delivery, shi... | [package show informed delivery, usps informed... |
| 4 | 3 | 1728 | 3_empire_empire market_empire empire_market | [empire, empire market, empire empire, market,... | [empire back, empire market back, empire suppo... |
| ... | ... | ... | ... | ... | ... |
| 64 | 63 | 191 | 63_dmt_dmt vendor_odsmt_dmt dmt | [dmt, dmt vendor, odsmt, dmt dmt, bluefairy, c... | [best dmt vendor, dmt, vendor dmt] |
| 65 | 64 | 186 | 64_captcha_rapture_rapture market_captcha captcha | [captcha, rapture, rapture market, captcha cap... | [captcha, use rapture registration login captc... |
| 66 | 65 | 181 | 65_chemical_research_research chemical_chems | [chemical, research, research chemical, chems,... | [chem theory honorable research chemical suppl... |
| 67 | 66 | 166 | 66_tor_browser_tor browser_tor network | [tor, browser, tor browser, tor network, netwo... | [tor browser help, review tor browser, tor bro... |
| 68 | 67 | 159 | 67_mephedrone_meopcp_mxe_mescaline | [mephedrone, meopcp, mxe, mescaline, mmc, meph... | [eurovalz new stock list mxe mephedrone availa... |
69 rows × 5 columns
umap_embeddings = topic_model.umap_model.fit_transform(tc1.corpus_embeddings)
indices = [index for index, topic in enumerate(topics) if topic != -1]
X=umap_embeddings[np.array(indices)]
labels = [topic for index, topic in enumerate(topics) if topic != -1]
silhouette_scores = silhouette_score(X, labels)
print(f"silhouette_score: {silhouette_scores}")
print(f"Davies_bouldin_score: {davies_bouldin_score(X, labels)}")
silhouette_score: 0.6823439598083496 Davies_bouldin_score: 0.3791311194398217
new_topics = topic_model.reduce_outliers(tc1.corpus, topics, strategy="embeddings", embeddings=tc1.corpus_embeddings, threshold=0.46)
topic_model.update_topics(tc1.corpus, topics=new_topics)
topic_model.get_topic_info()
2024-06-30 16:01:57,452 - BERTopic - WARNING: Using a custom list of topic assignments may lead to errors if topic reduction techniques are used afterwards. Make sure that manually assigning topics is the last step in the pipeline.Note that topic embeddings will also be created through weightedc-TF-IDF embeddings instead of centroid embeddings.
| Topic | Count | Name | Representation | Representative_Docs | |
|---|---|---|---|---|---|
| 0 | -1 | 18900 | -1_anyone_get_update_review | [anyone, get, update, review, new, order, acco... | [review vendor cdnven product china white synt... |
| 1 | 0 | 5021 | 0_weed_cannabis_cart_thc | [weed, cannabis, cart, thc, review, hash, shat... | [product vendor review ml lemon kush wax vape ... |
| 2 | 1 | 2035 | 1_help_guy_sub_need | [help, guy, sub, need, back, question, day, ad... | [hi guy update post, way make sub like old red... |
| 3 | 2 | 2005 | 2_order_shipping_package_pack | [order, shipping, package, pack, delivery, shi... | [package show informed delivery, usps informed... |
| 4 | 3 | 1815 | 3_empire_empire market_empire empire_market | [empire, empire market, empire empire, market,... | [empire back, empire market back, empire suppo... |
| ... | ... | ... | ... | ... | ... |
| 64 | 63 | 202 | 63_dmt_dmt vendor_dmt vape_odsmt | [dmt, dmt vendor, dmt vape, odsmt, dmt dmt, ch... | [best dmt vendor, dmt, vendor dmt] |
| 65 | 64 | 187 | 64_captcha_rapture_rapture market_captcha captcha | [captcha, rapture, rapture market, captcha cap... | [captcha, use rapture registration login captc... |
| 66 | 65 | 198 | 65_chemical_research_research chemical_chems | [chemical, research, research chemical, chems,... | [chem theory honorable research chemical suppl... |
| 67 | 66 | 222 | 66_tor_tor browser_browser_tor network | [tor, tor browser, browser, tor network, netwo... | [tor browser help, review tor browser, tor bro... |
| 68 | 67 | 178 | 67_mephedrone_meopcp_mxe_mescaline | [mephedrone, meopcp, mxe, mescaline, mmc, meph... | [eurovalz new stock list mxe mephedrone availa... |
69 rows × 5 columns
for topic_id in set(topics):
print(f"Topic {topic_id}:")
print(topic_model.get_topic(topic_id))
Topic 0:
[('weed', 0.034440655421142205), ('cannabis', 0.02163153800597487), ('cart', 0.02131043599512726), ('thc', 0.01824969706580151), ('review', 0.014699009276947663), ('hash', 0.013941473812216146), ('shatter', 0.01385956329618397), ('distillate', 0.013788654857474974), ('oz', 0.013699156416790433), ('bud', 0.013511109713200431)]
Topic 1:
[('help', 0.020902783340739296), ('guy', 0.018141447515630817), ('sub', 0.018061502588594965), ('need', 0.017228855503283312), ('back', 0.015478023601901827), ('question', 0.014588537674805049), ('day', 0.014103368936720316), ('advice', 0.013364282528612713), ('shit', 0.01315161003792278), ('post', 0.013047995251142981)]
Topic 2:
[('order', 0.05141015231966171), ('shipping', 0.049459163735666216), ('package', 0.034053136968648846), ('pack', 0.03022209818441062), ('delivery', 0.02795081519396416), ('shipped', 0.02329856657883124), ('ship', 0.018460022471923705), ('usps', 0.01791841479902273), ('delivered', 0.013805279333056695), ('international', 0.013489901890926527)]
Topic 3:
[('empire', 0.1408905571475779), ('empire market', 0.047832772840589455), ('empire empire', 0.031829806331982104), ('market', 0.022805110300898157), ('deposit', 0.013311333177807824), ('vendor empire', 0.013156717730784043), ('market empire', 0.011089067660616526), ('ticket', 0.01074427553045342), ('empire vendor', 0.010698545017112069), ('link', 0.010681298135100263)]
Topic 4:
[('vendor', 0.07352353126224266), ('vendor vendor', 0.027793634237021102), ('vendor inquiry', 0.01767525614277819), ('inquiry', 0.017541981259577955), ('new vendor', 0.016785192087489506), ('new', 0.014829319198509949), ('looking', 0.012972691707067285), ('trusted', 0.01072558646684358), ('looking vendor', 0.010710900710690481), ('legit', 0.009197289255501064)]
Topic 5:
[('scammer', 0.09029491748260987), ('scam', 0.08790776767015127), ('exit', 0.05409099855154675), ('scamming', 0.05345669284651443), ('scammed', 0.04387510350103406), ('exit scam', 0.03795962281948466), ('selective', 0.026166507631807726), ('exit scamming', 0.02118137287516553), ('alert', 0.018586128265137065), ('warning', 0.016658284455618223)]
Topic 6:
[('darknet', 0.06574314894703666), ('dark', 0.05575058109762389), ('web', 0.04056458466112466), ('dark web', 0.03888520120543353), ('darkfail', 0.0326590485356795), ('darkweb', 0.022966993332759263), ('sentenced', 0.02278933896007672), ('drug', 0.01891860040084595), ('prison', 0.016267179639628225), ('dark net', 0.01581062364638907)]
Topic 7:
[('mdma', 0.15108386628760112), ('mda', 0.0429162477081238), ('mdma vendor', 0.034000537140433544), ('domestic', 0.0190830757862918), ('usa', 0.018739701436677143), ('us', 0.0185565173484347), ('mdma review', 0.016368695282717644), ('domestic mdma', 0.015310305078250312), ('sale', 0.01522592122356552), ('mda powder', 0.014536706585474362)]
Topic 8:
[('xanax', 0.14902915079054058), ('mg', 0.037073236799781265), ('diazepam', 0.030553350154839825), ('xanax vendor', 0.03030524528382461), ('valium', 0.02763017540016205), ('mg xanax', 0.024213410884215428), ('bar', 0.023464625343629694), ('xanax bar', 0.02288490630780396), ('xanax mg', 0.020390790169055237), ('bars', 0.01626062009535231)]
Topic 9:
[('lsd', 0.1734647507288589), ('ug', 0.04747007330538135), ('tab', 0.04142689902020762), ('lsd vendor', 0.04039618697559746), ('acid', 0.022771452107687707), ('lsd tab', 0.022084975534138424), ('gammagoblin', 0.020282120388494473), ('free', 0.018343364070410467), ('usa', 0.016690416345776463), ('mdma', 0.016209423140569452)]
Topic 10:
[('crosspost', 0.14470709618860572), ('giveaway', 0.04202963247681046), ('review crosspost', 0.02057311906846835), ('crosspost vendor', 0.01589126119057147), ('review', 0.015116522044644013), ('crosspost review', 0.01268557574967833), ('crosspost new', 0.010364520318510435), ('winner', 0.010310623869567707), ('vendor crosspost', 0.010105053266291379), ('envoy', 0.00986726589013657)]
Topic 11:
[('monero', 0.09836195098914442), ('btc', 0.06363293656791628), ('bitcoin', 0.05875105248934212), ('coin', 0.0400067415743657), ('wallet', 0.03272232748466129), ('crypto', 0.031144635815675798), ('buying', 0.023400549290578096), ('buy', 0.022725372561839517), ('way', 0.019920869234552413), ('anonymously', 0.017395228092262436)]
Topic 12:
[('carding', 0.13434099717006076), ('card', 0.12768798940084336), ('credit', 0.03905087805207301), ('credit card', 0.037218429781870184), ('debit', 0.031090671713331016), ('gift', 0.026719556198699018), ('debit card', 0.02623880953133958), ('gift card', 0.025296889224605497), ('prepaid', 0.02390841050419752), ('cards', 0.02234112624489392)]
Topic 13:
[('dream', 0.16188938874288553), ('dream market', 0.0769761528601036), ('nightmare', 0.057569364500263495), ('market', 0.04348747836116363), ('dream dream', 0.03160064455881955), ('nightmare market', 0.015907385449258748), ('market dream', 0.015385428217323891), ('dream nightmare', 0.015124657030832909), ('anyone', 0.013442507797909524), ('dream alt', 0.013284958959525854)]
Topic 14:
[('dispute', 0.19114992246810456), ('mod', 0.06456174249994076), ('moderator', 0.04459012311206817), ('dispute dispute', 0.03637392127596315), ('please', 0.033520993517153756), ('help', 0.02470584993561166), ('help dispute', 0.021362067564137565), ('resolved', 0.020927566860518908), ('admin', 0.020875348152861174), ('dispute vendor', 0.020570879876576915)]
Topic 15:
[('cocaine', 0.19185759141507205), ('cocaine vendor', 0.033820265777428325), ('fishscale', 0.03184970819993608), ('peruvian', 0.02886221748054375), ('colombian', 0.025258187463892433), ('uncut', 0.02489998326649046), ('pure', 0.02338967723120477), ('fishscale cocaine', 0.02328527599852838), ('ukwhite', 0.02053657782269459), ('cocaine review', 0.019696278673770187)]
Topic 16:
[('review', 0.13479635045612737), ('vendor review', 0.09882182412820924), ('review vendor', 0.056146396360581954), ('vendor', 0.042324514725933904), ('review review', 0.026492179649533248), ('feedback', 0.018191404917665587), ('reviews', 0.017732395258446845), ('review template', 0.012447521444075921), ('sample', 0.012252829983770771), ('template', 0.010912428935692265)]
Topic 17:
[('market', 0.13229049891457828), ('market market', 0.02912990577253405), ('new market', 0.02733587453280759), ('markets', 0.018260042378394587), ('marketplace', 0.018237246649367742), ('new', 0.01653915451535478), ('core', 0.01391739463236357), ('grey market', 0.012662397716895505), ('core market', 0.012656085780194134), ('grey', 0.01194181520974549)]
Topic 18:
[('pgp', 0.2106338648158133), ('key', 0.13747067284852693), ('pgp key', 0.11633357860089025), ('public', 0.03390884077311098), ('public pgp', 0.029752434238696095), ('message', 0.02634991063313772), ('lost', 0.02000184309357757), ('vendor pgp', 0.01994359114136673), ('decrypt', 0.01894570485870717), ('lost pgp', 0.018617781813871224)]
Topic 19:
[('deposit', 0.21545306050516058), ('deposited', 0.058649535773789216), ('ticket', 0.04179694434525314), ('address', 0.03870905816314492), ('double', 0.03846549616753392), ('double deposit', 0.03667999808480363), ('deposit address', 0.03156203629282908), ('btc', 0.03071204186414341), ('btc deposit', 0.029610323600547078), ('deposit issue', 0.026879010967503388)]
Topic 20:
[('bar', 0.11981212043710264), ('bunk', 0.04275809433364427), ('bars', 0.04134049952006931), ('selaminy', 0.04065510251937409), ('hulk', 0.03808720407648988), ('xmf', 0.032625152153394094), ('xanmasterfrank', 0.024794638230597937), ('thebartender', 0.02146829100887259), ('bunk bar', 0.021104670137899736), ('pack', 0.02109839573855679)]
Topic 21:
[('oxycodone', 0.0915503419563695), ('mg', 0.0710126695185631), ('oxy', 0.06895163961328402), ('opiate', 0.045863554052605914), ('opiateconnect', 0.04453309489080648), ('oxycodone mg', 0.032572918296594006), ('opioids', 0.02865321440258756), ('oxycontin', 0.027684562577623886), ('morphine', 0.026473417513483474), ('mg mg', 0.022411097043153062)]
Topic 22:
[('id', 0.1385509849376173), ('passport', 0.08568772643406097), ('fake', 0.08482410071909122), ('fake id', 0.07918133670896313), ('license', 0.0649065274454038), ('scan', 0.047816586940883546), ('driver', 0.03623643897437548), ('driving', 0.03147360609206778), ('driver license', 0.029977190765821365), ('dl', 0.02565984346716325)]
Topic 23:
[('drug', 0.15474318847411217), ('drugsuk', 0.03997464930394203), ('drugs', 0.035767234032706756), ('selling drug', 0.013706118772916276), ('drug dealer', 0.01363616498515454), ('pharma', 0.011535993702901273), ('anyone', 0.011523457595840088), ('online', 0.011292374800951103), ('drug checking', 0.01121409717784059), ('drug market', 0.010871004656672078)]
Topic 24:
[('coke', 0.24480516496483845), ('coke vendor', 0.07247842001502117), ('best coke', 0.04428189107711554), ('uk coke', 0.0308130119571335), ('uk', 0.027295920632150066), ('good coke', 0.023893359833443418), ('best', 0.02301379158858306), ('coke review', 0.020957440344420427), ('domestic coke', 0.02013314071553494), ('cola', 0.018480674314440285)]
Topic 25:
[('pill', 0.13687679139191483), ('xtc', 0.0689429271115277), ('xtc pill', 0.05785884094585473), ('ecstasy', 0.049586539528066305), ('pills', 0.04890054928026709), ('mg', 0.03486535976865647), ('xtc pills', 0.025764845512285212), ('pill press', 0.024698668416445056), ('press', 0.02251824475187729), ('pillchills', 0.021667648535163505)]
Topic 26:
[('counterfeit', 0.18938590987283574), ('note', 0.09246239331422364), ('euro', 0.08637948498220155), ('money', 0.04200643406185359), ('counterfeit money', 0.03946683058559953), ('counterfeit euro', 0.038261536147981785), ('fake', 0.03268594490209314), ('bill', 0.03208617920470757), ('counterfeit note', 0.03202031699172969), ('currency', 0.030601930905961253)]
Topic 27:
[('ketamine', 0.27076491836153055), ('ketamine vendor', 0.05465820126593129), ('ketamine review', 0.034236075793543305), ('mdma ketamine', 0.026958811216225554), ('review ketamine', 0.023265028523820383), ('racemic', 0.021840213597887684), ('ketamine ketamine', 0.0209361890500582), ('review', 0.02035840901808624), ('usaconnect', 0.019703435870339864), ('domestic ketamine', 0.019540443113387658)]
Topic 28:
[('wsm', 0.26062962876168194), ('wsm wsm', 0.0405533967559985), ('wsm vendor', 0.03762271698792374), ('vendor wsm', 0.03206339997224917), ('wsm order', 0.017737920836218876), ('dream wsm', 0.0163160597194279), ('wsm exit', 0.016188508790288752), ('vendor', 0.015836552381021), ('order wsm', 0.015192151060626139), ('exit', 0.014335338467236433)]
Topic 29:
[('meth', 0.22276036172115224), ('crystal meth', 0.05328948275951995), ('crystal', 0.04915146873849497), ('meth vendor', 0.04802431132793452), ('methamphetamine', 0.03725379840439993), ('best meth', 0.026685331810131725), ('speed', 0.01638305099194758), ('vendor', 0.015045674869553384), ('meth review', 0.01449593744356139), ('meth speed', 0.013462093823955613)]
Topic 30:
[('ticket', 0.23191906538627902), ('support ticket', 0.13004355933812262), ('support', 0.11743036604119221), ('please', 0.045630976919213126), ('ticket support', 0.03775927767239459), ('month', 0.03625628502501229), ('response', 0.029074496373647844), ('help', 0.028793658434030697), ('ticket please', 0.027419804082700512), ('ticket ticket', 0.024956763887226712)]
Topic 31:
[('hacked', 0.07268774210946359), ('hacker', 0.05958968811663258), ('hacking', 0.05746148496961), ('job', 0.042647311606897144), ('lfw', 0.035572613614792414), ('malware', 0.03096845695427884), ('hack', 0.025379278779315104), ('exploit', 0.02510108738610593), ('account', 0.024285044626452958), ('developer', 0.022590978647495333)]
Topic 32:
[('login', 0.09281872111514107), ('account', 0.07744354819167176), ('password', 0.06405732928260777), ('log', 0.05534837243822272), ('fa', 0.053826774653849), ('error', 0.03207318072187401), ('registration', 0.026153382171231136), ('ca nt', 0.024842545357693684), ('nt', 0.024626489071347155), ('username', 0.02415273192532678)]
Topic 33:
[('adderall', 0.1768984328457466), ('mg', 0.06327189728144145), ('ir', 0.04289049261783819), ('ritalin', 0.042675061609971), ('vyvanse', 0.04215927084997145), ('mg adderall', 0.04139982170401677), ('adderall mg', 0.040035809578354675), ('pharmacy', 0.03187900435840742), ('adderall vendor', 0.030260415441556283), ('brand name', 0.029564246899279608)]
Topic 34:
[('xmr', 0.24399148967723583), ('btc xmr', 0.06444934680865366), ('btc', 0.060770387170328795), ('xmrto', 0.04594811650627081), ('xmr btc', 0.03659618863655146), ('xmr deposit', 0.029697291143319954), ('monero', 0.024426574198576537), ('monero xmr', 0.02388623803691428), ('xmr withdrawal', 0.022905677166382882), ('lfw', 0.020975478723098375)]
Topic 35:
[('tails', 0.16608161332689814), ('tail', 0.13811881990583533), ('electrum', 0.12793371341884016), ('wallet', 0.060172586573322195), ('whonix', 0.043956636853453926), ('monero', 0.042820101505499926), ('usb', 0.038269012567511224), ('electrum wallet', 0.03266818304438283), ('electrum tail', 0.02751289116948836), ('monero wallet', 0.026256935911540154)]
Topic 36:
[('mushroom', 0.13494669849792765), ('mushrooms', 0.09224942455638298), ('shrooms', 0.08697247444291728), ('magic', 0.07173853017427768), ('cubensis', 0.059415494503169554), ('magic mushrooms', 0.04196185835332625), ('psilocybin', 0.040879833586613595), ('psilocybe', 0.03648147963849479), ('magic mushroom', 0.03622256754377185), ('penis', 0.03389227020845581)]
Topic 37:
[('dread', 0.2681730554127759), ('dread dread', 0.0434925406301928), ('cafe dread', 0.038289226826336564), ('cafe', 0.03771246613377383), ('dread word', 0.03726941736665281), ('word day', 0.037067233457539685), ('word', 0.03431554512407786), ('sub dread', 0.02638917753177937), ('sub', 0.021469881583509116), ('new dread', 0.01950150523536743)]
Topic 38:
[('cc', 0.19850827602610305), ('cvv', 0.12559480815359578), ('vbv', 0.05156445823606583), ('cc vendor', 0.0390492104991615), ('cc cvv', 0.03541724139457509), ('non', 0.03209423751593194), ('non vbv', 0.031689110721461924), ('ccv', 0.03142631440888132), ('cvv vendor', 0.030653897772347494), ('fullz', 0.02565185109174636)]
Topic 39:
[('cryptonia', 0.270593329016721), ('cryptonia market', 0.05372997670742348), ('cryptonia cryptonia', 0.04063200874811663), ('dcdutchconnectionuk', 0.02698588870900908), ('empire cryptonia', 0.025352048121288082), ('market', 0.021908509868778566), ('dutyfreesmoking', 0.021779896231356913), ('vendor cryptonia', 0.021426552039488206), ('nightmare', 0.019568159691726875), ('cryptonia new', 0.015018539633818486)]
Topic 40:
[('withdraw', 0.16777560333243718), ('withdrawal', 0.13214115961261508), ('withdrawl', 0.04517713695493818), ('withdraws', 0.030940211999519157), ('btc', 0.030198882248887494), ('withdraw btc', 0.029522871625856834), ('working', 0.029302858049487304), ('pin', 0.02777645879969811), ('issue', 0.027607420425603244), ('withdraw pin', 0.026471975299199244)]
Topic 41:
[('escrow', 0.22451268938457392), ('multisig', 0.08174648178119914), ('escrow escrow', 0.028342657225423777), ('full escrow', 0.022926390507942773), ('escrow order', 0.021256992919067835), ('extend', 0.020684030739848886), ('extend escrow', 0.020634859815608034), ('order escrow', 0.020634859815608034), ('escrow service', 0.019024811620750936), ('full', 0.016810907663589756)]
Topic 42:
[('heroin', 0.24667865086560822), ('heroin vendor', 0.043298418940449716), ('afghan', 0.0370261380542706), ('afghan heroin', 0.03321404939224221), ('synthetic heroin', 0.027180379676850733), ('best heroin', 0.024176709214531092), ('synthetic', 0.023744583992494864), ('heroin sale', 0.02343861004012714), ('heroin review', 0.023225789058935086), ('ww', 0.02051867713165788)]
Topic 43:
[('de', 0.04104874164468111), ('har', 0.03004281267703729), ('noen', 0.025615388883690592), ('som', 0.023644974354175934), ('fra', 0.02234525177783555), ('en', 0.021807143998736837), ('la', 0.019481414592274055), ('para', 0.018155202455363964), ('mi', 0.017936341798979228), ('som har', 0.017123720458272217)]
Topic 44:
[('dnm', 0.17395559883217143), ('dnms', 0.04555790479945865), ('dn', 0.04298869175969707), ('bible', 0.03480993898853253), ('dnstars', 0.026858365160836242), ('dnmuk', 0.024792337071541142), ('dm', 0.023808743501494478), ('avenger', 0.019833539648394766), ('dnm vendor', 0.0145781416058812), ('vendor bible', 0.014363474603523505)]
Topic 45:
[('wallstreet', 0.17075397467717646), ('wall', 0.12652076561807357), ('wall street', 0.07815865540024333), ('street', 0.0744307008536918), ('wall st', 0.058824930764469485), ('st', 0.05389895713137546), ('street market', 0.05050436093268345), ('wallstreet market', 0.047109434439433946), ('wallstreetmarket', 0.0411780639775008), ('market', 0.03414270461323302)]
Topic 46:
[('ddos', 0.3105265840768657), ('ddos attack', 0.11291435352309753), ('attack', 0.10930939685702705), ('ddos ddos', 0.028760472115733474), ('ddos attacks', 0.026872070458530385), ('attacks', 0.025462720293905332), ('attack ddos', 0.024955566700335146), ('market ddos', 0.02300839257477511), ('market', 0.020031741575660723), ('ddos market', 0.01694829797415259)]
Topic 47:
[('paypal', 0.2908803867616197), ('transfer', 0.11757620933383377), ('paypal transfer', 0.08045981107451718), ('paypal account', 0.05635507976369717), ('western union', 0.04434216793462146), ('western', 0.0436696030188777), ('union', 0.04260904354730236), ('account', 0.0397607705908628), ('transfer paypal', 0.03064364170976873), ('venmo', 0.030317733806543675)]
Topic 48:
[('heard', 0.07747679742838355), ('happened', 0.049591151053306165), ('anyone', 0.04910471727518297), ('anyone heard', 0.0484273484083465), ('thewizzardnl', 0.03331435228064015), ('anybody heard', 0.03167116165532856), ('has anyone', 0.03119052964382607), ('has', 0.029099336350054232), ('anybody', 0.028548771954404018), ('therealrc', 0.028190854068417655)]
Topic 49:
[('benzos', 0.1469965293325937), ('benzo', 0.14387882009188516), ('rc', 0.04859228899225342), ('benzo vendor', 0.039310366559657056), ('rc benzos', 0.038034750391445375), ('benzobananas', 0.028228816077837456), ('rc benzo', 0.02735900406635033), ('benzoboys', 0.02658197245182248), ('best benzos', 0.02540478949018245), ('vendor benzos', 0.01997789146191072)]
Topic 50:
[('fraud', 0.25672448642865076), ('fraudsters', 0.037980752363923176), ('fraud vendor', 0.034182677127530856), ('loan fraud', 0.02683887738250451), ('fraudfox', 0.0262534840019881), ('loan', 0.024909941198737545), ('fraud fraud', 0.021471115793857264), ('fraud forum', 0.01962254027592412), ('uk fraud', 0.017738943947790978), ('best fraud', 0.017442258023043663)]
Topic 51:
[('dream', 0.1256863407223506), ('dream vendor', 0.0729529170933915), ('dream market', 0.05965219005493259), ('vendor dream', 0.04823192566238449), ('vendor', 0.04446878236261653), ('market', 0.04074878970532253), ('vendor inquiry', 0.0273713869493084), ('inquiry', 0.02705502158368874), ('nightmare market', 0.02683006952691045), ('nightmare', 0.026809442585803697)]
Topic 52:
[('order', 0.12808537202217138), ('cancel', 0.1056355402458776), ('cancelled', 0.08816652155833675), ('refund', 0.06317930029458749), ('cancel order', 0.06064213753804928), ('canceled', 0.04733993827455691), ('cancelled order', 0.04708312251370501), ('order cancelled', 0.03478731701269228), ('refunded', 0.027871549314468867), ('auto', 0.025578461854749594)]
Topic 53:
[('bank', 0.25097424693158105), ('bank log', 0.06619702073400643), ('bank drop', 0.06323528667005968), ('log', 0.05747191271800869), ('bank account', 0.05180288067730549), ('drop', 0.05152932337508871), ('logs', 0.050620524412714064), ('bank logs', 0.04511971186214804), ('account', 0.041499352402375615), ('bank logins', 0.033855349762364984)]
Topic 54:
[('onion', 0.32006057669572174), ('onion site', 0.06649607625940172), ('site', 0.04428860772985556), ('onion link', 0.04208690156388304), ('onion list', 0.031916646862297816), ('link', 0.031239331798188157), ('list', 0.028761313034509493), ('onion address', 0.02732765760686286), ('onions', 0.02694578747755256), ('onion service', 0.025703448035517374)]
Topic 55:
[('phishing', 0.23561044910532333), ('phishing link', 0.06313802156628703), ('phished', 0.06309006404198617), ('link', 0.0544273135365213), ('warning', 0.046834055380374974), ('phishing site', 0.042547653995183336), ('phishing warning', 0.0359857895652629), ('site', 0.027930499527085186), ('warning phishing', 0.02550649584946054), ('attempt', 0.02386768528099481)]
Topic 56:
[('apollon', 0.2610557915169623), ('apollon market', 0.15266804701967007), ('market', 0.05451694610184281), ('apollon apollon', 0.04000921043636116), ('mysteryland', 0.03430376947112439), ('market apollon', 0.02572590386075179), ('vendor apollon', 0.022431011183581366), ('jerry', 0.02222860417547563), ('apollo', 0.02130530184604832), ('tom jerry', 0.0200948479637061)]
Topic 57:
[('opsec', 0.35022729224865795), ('opsec opsec', 0.046836666139475104), ('opsec question', 0.038854537729898934), ('bad opsec', 0.024478757307401466), ('question', 0.023721106194186832), ('good opsec', 0.023158241948069137), ('opsec guide', 0.022347138495029003), ('guide', 0.02053973286259763), ('dnm', 0.018032696761829944), ('help opsec', 0.01731744867947547)]
Topic 58:
[('link', 0.2907868689372496), ('working link', 0.1425857891330286), ('working', 0.12626414963429772), ('pm', 0.10998816783875198), ('link please', 0.07544814928265496), ('pm link', 0.06343413695872846), ('link working', 0.05576602338283193), ('please', 0.05553799256006957), ('pm working', 0.04196619637285951), ('please pm', 0.04165737257090193)]
Topic 59:
[('mirror', 0.37636469017813645), ('working mirror', 0.11472362758530133), ('working', 0.10713557979686175), ('mirror link', 0.07119835688788886), ('empire mirror', 0.06148918029810922), ('mirror working', 0.05358983328686946), ('mirror please', 0.05138990153126206), ('mirrors', 0.05043749015234773), ('link', 0.048633708091083104), ('pm', 0.045039742736056855)]
Topic 60:
[('fentanyl', 0.24680043197686974), ('fent', 0.10237006383161038), ('carfentanil', 0.03273064608297345), ('selling fentanyl', 0.028024619668411806), ('analogue', 0.026872645234550612), ('fentanyl vendor', 0.02315329870273171), ('fentanyl analogue', 0.022766057057637824), ('admits', 0.021163030156912815), ('fentanyl distribution', 0.02064331974556389), ('distribution', 0.019829358659107895)]
Topic 61:
[('cgmc', 0.28724192332245013), ('invite', 0.24126623833526956), ('invite code', 0.13131152093750256), ('code', 0.10682393111274423), ('cgmc invite', 0.08719794948943649), ('code cgmc', 0.028725764339705678), ('invite cgmc', 0.025151130113143), ('cgmc cgmc', 0.021480773490797567), ('registration', 0.017751806113272), ('cgmc open', 0.017695285128428256)]
Topic 62:
[('alprazolam', 0.1774950431253047), ('powder', 0.0903159297884282), ('alprazolam powder', 0.0683344750163608), ('flualprazolam', 0.04816050372458763), ('etizolam', 0.03744089718480313), ('flubromazolam', 0.03348945359221391), ('mg', 0.0314066846132662), ('diclazepam', 0.031003866363976132), ('clonazolam', 0.029651945399140792), ('etizolam powder', 0.02546982532718339)]
Topic 63:
[('dmt', 0.3622943812322028), ('dmt vendor', 0.08894662727116771), ('dmt vape', 0.04150662526223707), ('odsmt', 0.03831447187988808), ('dmt dmt', 0.03553878477512212), ('changa', 0.032613763332761496), ('bluefairy', 0.031201379299106255), ('looking dmt', 0.029361329758132906), ('dmt changa', 0.027337509067753882), ('shimshai', 0.02647236711639086)]
Topic 64:
[('captcha', 0.2564820546479972), ('rapture', 0.20670440179803615), ('rapture market', 0.058817891966963326), ('captcha captcha', 0.026389804388985475), ('incorrect', 0.023606308505803694), ('login', 0.023258878731221535), ('dread captcha', 0.022538685458512522), ('link', 0.021828131299706968), ('main', 0.021274078161211345), ('main link', 0.020481763901360785)]
Topic 65:
[('chemical', 0.12788227993451434), ('research', 0.10544064542899824), ('research chemical', 0.07562371216638425), ('chems', 0.06505695446372571), ('research chemicals', 0.04572949815260401), ('chemicals', 0.045233832083363984), ('chem', 0.03560638093549809), ('chemgenie', 0.03429712361445301), ('chemical vendor', 0.0317058058936168), ('chemist', 0.02535261421452179)]
Topic 66:
[('tor', 0.28796745713292526), ('tor browser', 0.07657915395648766), ('browser', 0.07489476479942064), ('tor network', 0.03000705217019978), ('network', 0.02939376833861387), ('tor tor', 0.026505883480420277), ('tor project', 0.022309921193224436), ('research', 0.022046155749248675), ('vpn', 0.018954071236719917), ('project', 0.018945142159026828)]
Topic 67:
[('mephedrone', 0.15930437837531358), ('meopcp', 0.1259770106160847), ('mxe', 0.09446545586467191), ('mescaline', 0.07779051848670795), ('mmc', 0.05270377202214209), ('mephedrone mmc', 0.049619701685294317), ('amt', 0.04403369509350668), ('meodmt', 0.03627605355094202), ('mephedrone vendor', 0.03200220270040846), ('meow', 0.03132904607312401)]
Topic -1:
[('anyone', 0.009157486850034612), ('get', 0.005884162902552367), ('update', 0.005625579875985257), ('review', 0.005597399142755926), ('new', 0.005416412975120537), ('order', 0.005209978694101062), ('account', 0.005047053236402527), ('address', 0.004937169330719495), ('uk', 0.004731322461951784), ('free', 0.004693989597580415)]
topic_model.visualize_topics()
umap_embeddings = topic_model.umap_model.fit_transform(tc1.corpus_embeddings)
indices = [index for index, topic in enumerate(new_topics) if topic != -1]
X=umap_embeddings[np.array(indices)]
labels = [topic for index, topic in enumerate(new_topics) if topic != -1]
silhouette_scores = silhouette_score(X, labels)
print(f"silhouette_score: {silhouette_scores}")
print(f"Davies_bouldin_score: {davies_bouldin_score(X, labels)}")
silhouette_score: 0.48274117708206177 Davies_bouldin_score: 0.8687026513069406
topic_words = topic_model.get_topics()
topics_ll = [topic_words[i] for i in range(len(topic_words) - 1) if i != -1]
topn = 10
topic_list = []
for topic in topics_ll:
topic_list.append([word for word, _ in topic[:topn]])
coherence_model = CoherenceModel(
topics=topic_list,
texts=[doc.split() for doc in tc1.corpus],
dictionary=corpora.Dictionary([doc.split() for doc in tc1.corpus]),
coherence='c_v'
)
print(f"Coherence Model: {coherence_model.get_coherence()}")
Coherence Model: 0.5645685188607535
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
zero_shot_topics = pd.read_csv('../../../intent_crime.csv')['intent'].tolist()
dict_zero_shots_25 = ppt.assign_labels_to_topics(classifier, topic_model, zero_shot_topics, len(set(new_topics) - {-1}), threshold=.25)
Assigning labels to topics: 0%| | 0/68 [00:00<?, ?it/s]
Assigning labels to topics: 100%|██████████| 68/68 [42:46<00:00, 37.74s/it]
dict_zero_shots_25 = pd.read_csv('ZeroShotClassificationResults/all-MiniLM-L6-v2_150_20n/zero_shot_025.csv').set_index('Topic')['Labels'].to_dict()
dict_zero_shots_25[1] = 'ask help - ask help post'
dict_zero_shots_25[2] = 'order'
dict_zero_shots_25[10] = 'crosspost vendor'
dict_zero_shots_25[13] = 'dream market - dread'
dict_zero_shots_25[14] = 'ask help - moderator'
dict_zero_shots_25[15] = 'cocaine vendor - cocaine'
dict_zero_shots_25[20] = 'bunk bar'
dict_zero_shots_25[28] = 'wsm vendor - wsm market'
dict_zero_shots_25[39] = 'cryptonia market - dread'
dict_zero_shots_25[41] = 'escrow service'
dict_zero_shots_25[48] = 'event happened'
dict_zero_shots_25[51] = 'dream - dream vendor - dread'
dict_zero_shots_25[52] = 'order cancelled'
dict_zero_shots_25[57] = 'opsec questions'
dict_zero_shots_25[58] = 'link'
dict_zero_shots_25[59] = 'mirror - mirror link - working mirror'
dict_zero_shots_25[64] = 'capcha'
pd.DataFrame(list(dict_zero_shots_25.items()), columns=['Topic', 'Labels']).to_csv('ZeroShotClassificationResults/all-MiniLM-L6-v2_150_20n/zero_shot_025.csv', index=False)
topic_model.set_topic_labels(dict_zero_shots_25)
reduced_embeddings = UMAP(n_neighbors=15, n_components=2,
min_dist=0.0, metric='cosine').fit_transform(tc1.corpus_embeddings)
topic_model.visualize_documents(tc1.corpus, reduced_embeddings=reduced_embeddings,
hide_document_hover=True, hide_annotations=True, custom_labels=True)
topic_model.visualize_barchart(top_n_topics=70, custom_labels=True, n_words=10)
topic_model.visualize_hierarchy(custom_labels=True)
topic_model.visualize_heatmap(custom_labels=True)
topics_over_time = topic_model.topics_over_time(tc1.corpus, created_on,
global_tuning=True, evolution_tuning=True, nr_bins=100)
topic_model.visualize_topics_over_time(topics_over_time, top_n_topics=10, width=1250, height=700, custom_labels=True)
15it [00:22, 1.52s/it]
df['name_thread'] = df['name_thread'].str.lower().dropna()
df.drop_duplicates(subset='name_thread', inplace=True)
df.dropna(subset=['name_thread'], inplace=True)
created_on = df['created_on'].tolist()
indices = [index for index, topic in enumerate(new_topics) if topic != -1]
corpus_valid = [tc1.corpus[i] for i in indices]
created_on_valid = [created_on[i] for i in indices]
embeddings_valid = [tc1.corpus_embeddings[i] for i in indices]
topics_valid = [new_topics[i] for i in indices]
probs_valid = [probs[i] for i in indices]
results = pd.DataFrame({
'Document': corpus_valid,
'Embedding': embeddings_valid,
'Topic': topics_valid,
'Probability': probs_valid,
'Created_on': created_on_valid,
})
results_final = pd.merge(results, topic_model.get_topic_info(), on='Topic')
print(results_final.shape)
results_final.head()
(46629, 10)
| Document | Embedding | Topic | Probability | Created_on | Count | Name | CustomName | Representation | Representative_Docs | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | naturalmeds apollon | [-0.11246682, -0.03748099, 0.04490077, 0.01724... | 56 | [1.1964202652021368e-307, 6.589650224023648e-3... | 2020-01-09 | 242 | 56_apollon_apollon market_market_apollon apollon | market - apollon - apollon market | [apollon, apollon market, market, apollon apol... | [back apollon, apollon, apollon market] |
| 1 | redemption btc giveaway coke sub dread member ... | [-0.043700494, -0.032600075, 0.0051953266, 0.0... | 24 | [0.006596099708564405, 0.003763170646085399, 0... | 2020-01-09 | 544 | 24_coke_coke vendor_best coke_uk coke | cocaine | [coke, coke vendor, best coke, uk coke, uk, go... | [fire coke, coke vendor , coke vendor] |
| 2 | flubromazolam sample giveaway | [-0.04101017, 0.007629349, -0.07528322, -0.033... | 62 | [8.335712654832696e-308, 7.430095287264378e-30... | 2019-11-06 | 290 | 62_alprazolam_powder_alprazolam powder_flualpr... | powder - alprazolam - alprazolam powder | [alprazolam, powder, alprazolam powder, flualp... | [usa domestic alprazolam powder mxe apvp inbom... |
| 3 | cigarette tobacco replica | [-0.07527819, 0.13146353, -0.07912154, -0.0353... | 0 | [0.040079176553298505, 0.007976173889767435, 0... | 2020-01-09 | 5021 | 0_weed_cannabis_cart_thc | marijuana | [weed, cannabis, cart, thc, review, hash, shat... | [product vendor review ml lemon kush wax vape ... |
| 4 | requiring image image review | [-0.014864997, 0.08211257, 0.004136639, 0.0027... | 16 | [0.006753134223967767, 0.007309909359113744, 0... | 2019-11-06 | 998 | 16_review_vendor review_review vendor_vendor | reviews vendor | [review, vendor review, review vendor, vendor,... | [xpost danknation vendor review sunaero multis... |
results_final.to_parquet('ResultsBERTopic/BERTopic_all-MiniLM-L6-v2_150_20n.parquet')
topic_model.save("Models/topic_model_all-MiniLM-L6-v2_150_20n", serialization="pickle", save_ctfidf=True, save_embedding_model=model)
topic_model.save("Models/topic_model_all-MiniLM-L6-v2_150_20n_safetensors", serialization="safetensors", save_ctfidf=True, save_embedding_model=model)
topic_model = BERTopic.load("Models/topic_model_all-MiniLM-L6-v2_150_20n")
sentence = ['recently closed Samsara market']
tp, pr = topic_model.transform(sentence)
Batches: 100%|██████████| 1/1 [00:00<00:00, 2.00it/s] 2024-06-30 21:54:03,236 - BERTopic - Dimensionality - Reducing dimensionality of input embeddings. 2024-06-30 21:54:08,543 - BERTopic - Dimensionality - Completed ✓ 2024-06-30 21:54:08,544 - BERTopic - Clustering - Approximating new points with `hdbscan_model` 2024-06-30 21:54:08,576 - BERTopic - Probabilities - Start calculation of probabilities with HDBSCAN 2024-06-30 21:54:08,699 - BERTopic - Probabilities - Completed ✓ 2024-06-30 21:54:08,701 - BERTopic - Cluster - Completed ✓
top_indices = np.argsort(pr[0])[::-1][:5]
top_topics = [(topic_model.get_topic(i), pr[0][i], topic_model.custom_labels_[i+1]) for i in top_indices]
df_finals = pd.DataFrame(top_topics, columns=['Topic', 'Probability', 'Label'])
df_finals['Words'] = df_finals['Topic'].apply(lambda topic: [word for word, prob in topic])
df_finals['Sentence'] = sentence * len(df_finals)
df_finals
| Topic | Probability | Label | Words | Sentence | |
|---|---|---|---|---|---|
| 0 | [(cryptonia, 0.270593329016721), (cryptonia ma... | 0.002991 | cryptonia market - dread | [cryptonia, cryptonia market, cryptonia crypto... | recently closed Samsara market |
| 1 | [(empire, 0.1408905571475779), (empire market,... | 0.002262 | empire market | [empire, empire market, empire empire, market,... | recently closed Samsara market |
| 2 | [(wallstreet, 0.17075397467717646), (wall, 0.1... | 0.002127 | wallstreet | [wallstreet, wall, wall street, street, wall s... | recently closed Samsara market |
| 3 | [(scammer, 0.09029491748260987), (scam, 0.0879... | 0.002075 | fraud - scammer | [scammer, scam, exit, scamming, scammed, exit ... | recently closed Samsara market |
| 4 | [(crosspost, 0.14470709618860572), (giveaway, ... | 0.002045 | crosspost vendor | [crosspost, giveaway, review crosspost, crossp... | recently closed Samsara market |
topic_model.get_topic(tp[0])
[('anyone', 0.009415205712082564),
('update', 0.007916840604830654),
('address', 0.006939392479966835),
('new', 0.006268540212828576),
('get', 0.006179372827051399),
('vacation', 0.0059803996923821),
('has', 0.0058347636749867746),
('need', 0.005401699096715211),
('drop', 0.005394598005495695),
('listing', 0.005367223382048893)]
OTHER - No Preprocessing¶
df = pd.read_csv('../merged_data.csv')
df = df.dropna(subset=['name_thread'])
df = df.drop_duplicates(subset=['name_thread'], keep='first')
df.shape[0]
68931
df.head()
| name_board | creator_thread | creator_id_thread | name_thread | created_on | registration_date | total_posts | reputation | creator_post | creator_id_post | content | created_on_post | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Tor | blonger | 32544 | Tor shrinked of about 13,5% but you can contri... | 2020-01-09 | 1900-01-01 00:00:00 | 0.0 | 0.0 | [deleted] | 37 | [removed] | 2019-10-16 |
| 2 | Tor | Xanitforthecash | 2467 | should I run a tor node | 2020-01-09 | 2018-03-21 00:00:00 | 88.0 | 9.0 | rswz | 32661 | ***LINK***http://ea5faa5po25cf7fb.onion/projec... | 2019-10-16 |
| 7 | Tor | Syndicate | 33036 | Tor Browser 9.0! | 2019-11-06 | 1900-01-01 00:00:00 | 0.0 | 0.0 | Syndicate | 33036 | Tor Browser 9.0 is now available from the Tor ... | 2019-10-30 |
| 9 | Tor | Syndicate | 33036 | [UPDATED] Tor Security Guide Crosspost | 2019-11-06 | 1900-01-01 00:00:00 | 0.0 | 0.0 | Syndicate | 33036 | ***LINK***[UPDATED] Tor Security Guide[/post/9... | 2019-10-30 |
| 10 | Tor | wekhiu48 | 1178 | /u/CostcoRotisserieChicken spreading FUD, Let'... | 2020-01-09 | 2019-05-24 00:00:00 | 1355.0 | 342.0 | wekhiu48 | 1178 | ***LINK***/u/CostcoRotisserieChicken spreading... | 2019-10-30 |
model = SentenceTransformer('all-MiniLM-L6-v2')
tc1 = ppt.TextClustering(df, 'name_thread')
tc1.encode_corpus(model, batch_size=64, to_tensor=False)
2024-07-01 16:22:21,480 - PreProcessingText - INFO - Encoding the corpus. This might take a while. Batches: 100%|██████████| 1065/1065 [10:23<00:00, 1.71it/s]
array([[-2.8301043e-02, 1.2166312e-02, 8.7255865e-02, ...,
-8.3833829e-02, -9.1334045e-02, -5.1197205e-02],
[-4.2577364e-02, 2.9390754e-02, 4.8442027e-03, ...,
-1.0984470e-01, -1.3457792e-02, 6.7995749e-02],
[ 2.0266762e-02, 4.9410637e-02, -9.2650507e-06, ...,
-6.9691852e-02, 5.9254151e-03, -2.0927912e-02],
...,
[-5.2471079e-02, -7.9523809e-02, -2.1687793e-02, ...,
-1.5866488e-02, -5.3970262e-02, 2.4720646e-02],
[-3.0821422e-02, -3.0247206e-02, -1.0898691e-02, ...,
-4.8142254e-02, 9.6911322e-03, -6.1955258e-02],
[ 2.7433981e-03, -8.2745239e-02, -3.2834979e-03, ...,
-4.6796635e-02, 6.8392135e-02, -3.5423629e-02]], dtype=float32)
mmr = MaximalMarginalRelevance(diversity=0.3)
kw = KeyBERTInspired()
vectorizer_model = CountVectorizer(stop_words="english", min_df=2, ngram_range=(1, 2))
pos_model = PartOfSpeech("en_core_web_sm")
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)
umap_model = UMAP(n_neighbors=20, n_components=10, min_dist=0.0, metric='cosine', random_state=42)
hdbscan_model = hdbscan.HDBSCAN(min_cluster_size=150, metric='euclidean', cluster_selection_method='eom', prediction_data=True)
topic_model = BERTopic(
top_n_words=10,
n_gram_range=(1, 2),
umap_model=umap_model,
hdbscan_model=hdbscan_model,
vectorizer_model=vectorizer_model,
ctfidf_model=ctfidf_model,
representation_model={
"KeyBERT": kw,
"MMR": mmr,
"POS": pos_model
},
embedding_model=model,
calculate_probabilities=True,
verbose=True
)
topics, probs = topic_model.fit_transform(tc1.corpus, tc1.corpus_embeddings)
2024-07-01 17:56:59,017 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm 2024-07-01 17:59:01,118 - BERTopic - Dimensionality - Completed ✓ 2024-07-01 17:59:01,139 - BERTopic - Cluster - Start clustering the reduced embeddings 2024-07-01 17:59:52,141 - BERTopic - Cluster - Completed ✓ 2024-07-01 17:59:52,287 - BERTopic - Representation - Extracting topics from clusters using representation models. 2024-07-01 18:00:27,018 - BERTopic - Representation - Completed ✓
topic_model.get_topic_info()
| Topic | Count | Name | Representation | KeyBERT | MMR | POS | Representative_Docs | |
|---|---|---|---|---|---|---|---|---|
| 0 | -1 | 24206 | -1_etizolam_fullz_berlusconi_cdnven | [etizolam, fullz, berlusconi, cdnven, pin, dro... | [customs, ukbk, cdnven, heinekenexpress, produ... | [etizolam, fullz, berlusconi, cdnven, pin, dro... | [fullz, berlusconi, cdnven, drop, tochka, bond... | [pill presses - chemicals - amphetamine salts ... |
| 1 | 0 | 5970 | 0_weed_carts_thc_cannabis | [weed, carts, thc, cannabis, distillate, shatt... | [bulk weed, cannabis, uk weed, edibles, weed v... | [weed, carts, thc, cannabis, distillate, shatt... | [weed, carts, thc, cannabis, distillate, shatt... | [1500 mg thc gummies - 60 pieces x 25mg each -... |
| 2 | 1 | 2675 | 1_sub_noob_hello_subs | [sub, noob, hello, subs, guys, thank, communit... | [mentor, guides, newbie, happy new, guide, com... | [sub, noob, hello, subs, guys, thank, communit... | [sub, noob, subs, guys, community, help, notif... | [i dont know what to do my life sucks now all ... |
| 3 | 2 | 1919 | 2_empire_empire empire_empire market_market em... | [empire, empire empire, empire market, market ... | [empire market, empire markets, market empire,... | [empire, empire empire, empire market, market ... | [empire, mods, link, market, dispute, account,... | [empire market is back!, zaguble is now on thi... |
| 4 | 3 | 1780 | 3_dream_dream market_nightmare_dream dream | [dream, dream market, nightmare, dream dream, ... | [dream market, market dream, nightmare market,... | [dream, dream market, nightmare, dream dream, ... | [dream, nightmare, nightmare market, dreammark... | [why is dream market still up?, dream market i... |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 69 | 68 | 173 | 68_chemicals_research chemicals_research_chems | [chemicals, research chemicals, research, chem... | [research chemicals, research chemical, resear... | [chemicals, research chemicals, research, chem... | [chemicals, research, chems, chemical, chemist... | [research chemicals?, research chemicals, chem... |
| 70 | 69 | 162 | 69_xanmasterfrank_xans_xan_xmf | [xanmasterfrank, xans, xan, xmf, xansalad, xan... | [mailman xans, xan vendor, new xan, xans, xan,... | [xanmasterfrank, xans, xan, xmf, xansalad, xan... | [xans, xansalad, game, demand, savior, rolling... | [[notice] c10labs & mailman-xans vendor update... |
| 71 | 70 | 159 | 70_2c_2cb_domestic 2c_2cb vendors | [2c, 2cb, domestic 2c, 2cb vendors, cb, 2cb ve... | [2c available, 2c vendors, 2c, looking 2c, 2c ... | [2c, 2cb, domestic 2c, 2cb vendors, cb, 2cb ve... | [2cb, 2cb vendor, good 2cb, domestic, ausorgan... | [naghb!!! new batch nice and white, super dry!... |
| 72 | 71 | 156 | 71_reviews_reviewer_reviews review_review theb... | [reviews, reviewer, reviews review, review the... | [reviews, reviews review, reviews trusted, rev... | [reviews, reviewer, reviews review, review the... | [reviews, reviewer, thebotanist, rcexpress, ho... | [need good reviews by trusted people, no vouch... |
| 73 | 72 | 150 | 72_samples_free samples_samples new_samples free | [samples, free samples, samples new, samples f... | [free samples, samples free, samples uk, free ... | [samples, free samples, samples new, samples f... | [samples, free samples, free, kwayuk, amphetam... | [new vendor - free samples :), free samples ||... |
74 rows × 8 columns
vectorizer_model = CountVectorizer(stop_words="english", ngram_range=(1, 5))
topic_model.update_topics(tc1.corpus, vectorizer_model=vectorizer_model)
umap_embeddings = topic_model.umap_model.fit_transform(tc1.corpus_embeddings)
indices = [index for index, topic in enumerate(topics) if topic != -1]
X=umap_embeddings[np.array(indices)]
labels = [topic for index, topic in enumerate(topics) if topic != -1]
silhouette_scores = silhouette_score(X, labels)
print(f"silhouette_score: {silhouette_scores}")
print(f"Davies_bouldin_score: {davies_bouldin_score(X, labels)}")
silhouette_score: 0.6635172367095947 Davies_bouldin_score: 0.41709404604850947
topic_words = topic_model.get_topics()
topics_ll = [topic_words[i] for i in range(len(topic_words) - 1) if i != -1]
topn = 10
topic_list = []
for topic in topics_ll:
topic_list.append([word for word, _ in topic[:topn]])
coherence_model = CoherenceModel(
topics=topic_list,
texts=[doc.split() for doc in tc1.corpus],
dictionary=corpora.Dictionary([doc.split() for doc in tc1.corpus]),
coherence='c_v'
)
print(f"Coherence Model: {coherence_model.get_coherence()}")
Coherence Model: 0.4770898279015756
topic_model.visualize_topics()
new_topics = topic_model.reduce_outliers(tc1.corpus, topics, strategy="embeddings", embeddings=tc1.corpus_embeddings, threshold=0.43)
topic_model.update_topics(tc1.corpus, topics=new_topics)
topic_model.get_topic_info()
2024-07-01 18:19:47,256 - BERTopic - WARNING: Using a custom list of topic assignments may lead to errors if topic reduction techniques are used afterwards. Make sure that manually assigning topics is the last step in the pipeline.Note that topic embeddings will also be created through weightedc-TF-IDF embeddings instead of centroid embeddings.
| Topic | Count | Name | Representation | KeyBERT | MMR | POS | Representative_Docs | |
|---|---|---|---|---|---|---|---|---|
| 0 | -1 | 16327 | -1_to_the_is_for | [to, the, is, for, anyone, with, how, and, in,... | [customs, ukbk, cdnven, heinekenexpress, produ... | [etizolam, fullz, berlusconi, cdnven, pin, dro... | [fullz, berlusconi, cdnven, drop, tochka, bond... | [pill presses - chemicals - amphetamine salts ... |
| 1 | 0 | 5984 | 0_weed_carts_thc_cannabis | [weed, carts, thc, cannabis, review, distillat... | [bulk weed, cannabis, uk weed, edibles, weed v... | [weed, carts, thc, cannabis, distillate, shatt... | [weed, carts, thc, cannabis, distillate, shatt... | [1500 mg thc gummies - 60 pieces x 25mg each -... |
| 2 | 1 | 2680 | 1_this_help_you_the | [this, help, you, the, what, to, need, we, me,... | [mentor, guides, newbie, happy new, guide, com... | [sub, noob, hello, subs, guys, thank, communit... | [sub, noob, subs, guys, community, help, notif... | [i dont know what to do my life sucks now all ... |
| 3 | 2 | 1995 | 2_empire_on empire_empire market_on | [empire, on empire, empire market, on, empire ... | [empire market, empire markets, market empire,... | [empire, empire empire, empire market, market ... | [empire, mods, link, market, dispute, account,... | [empire market is back!, zaguble is now on thi... |
| 4 | 3 | 1851 | 3_dream_nightmare_on dream_dream market | [dream, nightmare, on dream, dream market, mar... | [dream market, market dream, nightmare market,... | [dream, dream market, nightmare, dream dream, ... | [dream, nightmare, nightmare market, dreammark... | [why is dream market still up?, dream market i... |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 69 | 68 | 196 | 68_chemicals_research_research chemicals_chems | [chemicals, research, research chemicals, chem... | [research chemicals, research chemical, resear... | [chemicals, research chemicals, research, chem... | [chemicals, research, chems, chemical, chemist... | [research chemicals?, research chemicals, chem... |
| 70 | 69 | 274 | 69_xtc_xmf_xans_xan | [xtc, xmf, xans, xan, xanmasterfrank, xanos, x... | [mailman xans, xan vendor, new xan, xans, xan,... | [xanmasterfrank, xans, xan, xmf, xansalad, xan... | [xans, xansalad, game, demand, savior, rolling... | [[notice] c10labs & mailman-xans vendor update... |
| 71 | 70 | 201 | 70_2c_2cb_cb_2cb vendors | [2c, 2cb, cb, 2cb vendors, for 2c, for 2cb, 2c... | [2c available, 2c vendors, 2c, looking 2c, 2c ... | [2c, 2cb, domestic 2c, 2cb vendors, cb, 2cb ve... | [2cb, 2cb vendor, good 2cb, domestic, ausorgan... | [naghb!!! new batch nice and white, super dry!... |
| 72 | 71 | 453 | 71_review_reviews_review review_feedback | [review, reviews, review review, feedback, kar... | [reviews, reviews review, reviews trusted, rev... | [reviews, reviewer, reviews review, review the... | [reviews, reviewer, thebotanist, rcexpress, ho... | [need good reviews by trusted people, no vouch... |
| 73 | 72 | 238 | 72_samples_free samples_free_sample | [samples, free samples, free, sample, samples ... | [free samples, samples free, samples uk, free ... | [samples, free samples, samples new, samples f... | [samples, free samples, free, kwayuk, amphetam... | [new vendor - free samples :), free samples ||... |
74 rows × 8 columns
for topic_id in set(topics):
print(f"Topic {topic_id}:")
print(topic_model.get_topic(topic_id))
Topic 0:
[('weed', 0.026121802166754134), ('carts', 0.017488248895024136), ('thc', 0.016026211658409277), ('cannabis', 0.01577853469147459), ('review', 0.013686158893156207), ('distillate', 0.010819478752233087), ('hash', 0.010250132075366852), ('shatter', 0.010090807113538146), ('indoor', 0.009561748524550818), ('vape', 0.009464869665684333)]
Topic 1:
[('this', 0.022156612826095497), ('help', 0.01632414770533675), ('you', 0.014978321013280858), ('the', 0.012811440985770573), ('what', 0.012388997306343949), ('to', 0.012100264616063165), ('need', 0.01109007497215625), ('we', 0.0107600044873439), ('me', 0.009563915734872003), ('here', 0.009511330630291633)]
Topic 2:
[('empire', 0.10770406850965517), ('on empire', 0.04816961138770311), ('empire market', 0.0328421796934902), ('on', 0.022504719462360295), ('empire empire', 0.020009257170742853), ('market', 0.017631311042885157), ('empire is', 0.015589458155944999), ('is', 0.013414536942523092), ('to empire', 0.011324614016653523), ('is empire', 0.010859945457812397)]
Topic 3:
[('dream', 0.09641002602673354), ('nightmare', 0.04913856746609453), ('on dream', 0.047042687321489224), ('dream market', 0.03765107394642499), ('market', 0.028699181277440124), ('on', 0.025210226979058968), ('nightmare market', 0.018702413637631816), ('on nightmare', 0.01868877995580611), ('from dream', 0.018357467692169397), ('dream dream', 0.013163321597797439)]
Topic 4:
[('vendor', 0.052700029031768666), ('vendors', 0.04324652820637855), ('for', 0.01393399811834629), ('new vendor', 0.012932303425000725), ('looking', 0.012816596770230374), ('new', 0.012787313927470147), ('inquiry', 0.012680585032289849), ('vendor inquiry', 0.012597115343610051), ('looking for', 0.012564980334643528), ('for vendor', 0.011565740630903716)]
Topic 5:
[('scammer', 0.06557991898322732), ('scam', 0.062423182493869016), ('scamming', 0.0416026763582103), ('exit', 0.03962120344428672), ('scammed', 0.033278335068797134), ('legit', 0.026933766650158217), ('exit scam', 0.024854726513092378), ('is', 0.024516339837274866), ('is scammer', 0.022010863309222133), ('selective', 0.020247491591059354)]
Topic 6:
[('mdma', 0.12356226179434289), ('mda', 0.033837876024986675), ('mdma vendor', 0.021887930572066675), ('us', 0.01984273171377282), ('domestic', 0.015445618836374264), ('only', 0.014518200241130881), ('mdma review', 0.01350972889234908), ('usa', 0.013353540656566994), ('best mdma', 0.013039974920400056), ('for mdma', 0.01299316496196008)]
Topic 7:
[('darknet', 0.05747348912936089), ('dark', 0.05109485353135456), ('dark web', 0.03548081747093856), ('web', 0.0349281975945472), ('sentenced', 0.021720947221901443), ('the darknet', 0.017948854650965712), ('sentenced to', 0.017705595191481), ('drug', 0.017108539242676322), ('darkweb', 0.016965668888090583), ('the dark', 0.01635221966708456)]
Topic 8:
[('xanax', 0.13265126906538416), ('bars', 0.03233130571500381), ('xanax bars', 0.0275726855493186), ('2mg', 0.022540800070776374), ('xanax vendor', 0.018903558996081354), ('3mg', 0.0171280849541117), ('best xanax', 0.01274858148476675), ('xanaxdaddy', 0.012584567449035392), ('review', 0.012191854584679848), ('xanax and', 0.012161589874413053)]
Topic 9:
[('lsd', 0.1500541317544297), ('tabs', 0.03700257669240534), ('lsd vendor', 0.028352613109866642), ('gammagoblin', 0.01654793640468815), ('lsd tabs', 0.016497458693354932), ('free', 0.01600863216095035), ('usa', 0.015898146825191358), ('mdma', 0.014025313395207913), ('lsd and', 0.013043958505000241), ('lsd review', 0.012766233200526075)]
Topic 10:
[('carding', 0.1110495291929295), ('card', 0.06721751572964317), ('cards', 0.06601786278324696), ('credit', 0.029322000678320657), ('gift', 0.02302338008431706), ('debit', 0.019371118488606325), ('credit card', 0.01821021474864727), ('carder', 0.01626656387439844), ('carded', 0.01619506300655433), ('prepaid', 0.01577363769638637)]
Topic 11:
[('pgp', 0.13587212423768744), ('key', 0.0798850048816273), ('pgp key', 0.06858432902502162), ('2fa', 0.05618980658572741), ('public', 0.020958928563643633), ('keys', 0.019911531140275458), ('public pgp', 0.018319258469666012), ('encrypt', 0.018078668999193033), ('my', 0.015908248312017165), ('encryption', 0.015567742749305113)]
Topic 12:
[('market', 0.08551283440316028), ('markets', 0.045749059635866764), ('the market', 0.019094970124643295), ('this market', 0.017914991116499124), ('new market', 0.017627390353052007), ('market is', 0.017152331153567645), ('marketplace', 0.016173780267312046), ('the', 0.016093209921969998), ('what', 0.015302905089668325), ('is', 0.013757711975997023)]
Topic 13:
[('shipping', 0.06707952945625859), ('delivery', 0.03739198906886787), ('package', 0.0335807962526959), ('shipped', 0.021681308295394053), ('usps', 0.01928404468808869), ('to', 0.017620772243724696), ('international', 0.017275062354480578), ('delivered', 0.01699843444922411), ('shipping to', 0.016221752036348076), ('ship', 0.014560830277983484)]
Topic 14:
[('deposit', 0.1264517942094552), ('deposits', 0.03923223907665865), ('btc', 0.03729918660678976), ('deposited', 0.035671739831721616), ('address', 0.035130090808307056), ('ticket', 0.029276657832733487), ('double', 0.02676244272911281), ('double deposit', 0.024892547344915906), ('missing', 0.024171735143637723), ('not', 0.02167293853453858)]
Topic 15:
[('cocaine', 0.16558700571468687), ('fishscale', 0.025497383031868166), ('peruvian', 0.024666728874358633), ('cocaine vendor', 0.023603150801550694), ('fishscale cocaine', 0.021690075192145053), ('uncut', 0.021180531857615013), ('90', 0.019718351568327092), ('colombian', 0.01883387407701839), ('pure', 0.018772086249928766), ('ukwhite', 0.018187008764506282)]
Topic 16:
[('dispute', 0.18107255614430504), ('disputes', 0.03870186099223393), ('dispute dispute', 0.027198209942153568), ('please', 0.02385847984002258), ('mod', 0.020718806954577403), ('days', 0.019586736957386802), ('resolved', 0.019556720533625234), ('help', 0.01836971378875693), ('no', 0.018175031416175165), ('my dispute', 0.017926092916419396)]
Topic 17:
[('job', 0.053617966900222684), ('lfw', 0.049012324715556436), ('hacking', 0.032951530510461076), ('hacked', 0.03104745784243176), ('hacker', 0.030683441109691355), ('email', 0.02559051646261674), ('malware', 0.017334036909751067), ('btc', 0.016687285255972538), ('for', 0.015662969796092323), ('lfw btc', 0.014489022479059529)]
Topic 18:
[('monero', 0.0840300999786981), ('bitcoin', 0.054101446350999234), ('btc', 0.04376407780260462), ('wallet', 0.02102890172499106), ('buy', 0.020963394878476746), ('crypto', 0.02012981815428904), ('buying', 0.020066985939152047), ('to', 0.016582297731664773), ('to buy', 0.016291413951388542), ('coins', 0.015919374208244234)]
Topic 19:
[('id', 0.10997763982427408), ('fake', 0.07081625600743174), ('fake id', 0.06032720035661417), ('license', 0.04806559029900924), ('passport', 0.04803835906156708), ('passports', 0.03248988196610057), ('scans', 0.0316159900336969), ('drivers', 0.030886177518274296), ('ids', 0.029724785628560103), ('drivers license', 0.02478697220409289)]
Topic 20:
[('anyone', 0.07403115183970083), ('happened', 0.04158765883819754), ('has anyone', 0.040647945759817825), ('happened to', 0.04036933800330011), ('what happened', 0.03677703377857992), ('heard', 0.03664614180565851), ('tried', 0.03569834479856244), ('has', 0.03525422736021841), ('anyone tried', 0.03432681011844722), ('anyone heard', 0.030885389950925424)]
Topic 21:
[('oxycodone', 0.06628105916641315), ('oxy', 0.050914576261560554), ('opiateconnect', 0.029309391867125948), ('80mg', 0.025514415664952925), ('opiates', 0.024522116459384753), ('morphine', 0.023407474802139994), ('dilaudid', 0.02230935589510508), ('10mg', 0.022182979515936393), ('oxycontin', 0.020267064194940176), ('40mg', 0.018017473812191667)]
Topic 22:
[('meth', 0.1574163112434691), ('amphetamine', 0.049300409593825305), ('crystal meth', 0.038976415801772385), ('crystal', 0.036505916304162125), ('meth vendor', 0.02617692499953047), ('methamphetamine', 0.025396642831782397), ('best meth', 0.018245108818923387), ('meth vendors', 0.017370472966145134), ('speed', 0.015739908907543582), ('meth and', 0.014542599402273156)]
Topic 23:
[('counterfeit', 0.12871490570155777), ('notes', 0.06547435507955257), ('euro', 0.0491725502130328), ('money', 0.03141880581362891), ('fake', 0.029951479845183634), ('gbp', 0.029799843552008314), ('counterfeits', 0.029192110618025423), ('counterfeit money', 0.027394824167863107), ('bills', 0.026267525876960095), ('counterfeit euro', 0.0236297292596767)]
Topic 24:
[('dread', 0.15698287206715708), ('on dread', 0.0327391257020154), ('hugbunter', 0.03200970784564761), ('dread is', 0.029548523962026743), ('cafe dread', 0.022703209273806108), ('cafe', 0.022384956404216023), ('word of', 0.021845632483744725), ('dread word', 0.021620621848201585), ('the day', 0.02132354776547619), ('word', 0.01876864260879318)]
Topic 25:
[('ticket', 0.16201131673866934), ('support ticket', 0.09713196568034925), ('support', 0.09643044005038842), ('tickets', 0.041286863293046976), ('please', 0.034380750920094336), ('help', 0.024471719016428438), ('response', 0.01967290184393878), ('se7en', 0.019158710405234706), ('please help', 0.018414086886693087), ('support tickets', 0.017575439859763984)]
Topic 26:
[('onion', 0.11803562686306104), ('tor', 0.0984618899334574), ('browser', 0.025339074667152597), ('tor browser', 0.024049148739020237), ('onion site', 0.021392123557307526), ('onions', 0.018010212171229376), ('on tor', 0.01790931186994719), ('site', 0.017102240366678303), ('v3', 0.012676813959885938), ('the onion', 0.01246732698095282)]
Topic 27:
[('tails', 0.183531729136247), ('electrum', 0.08784236176016401), ('on tails', 0.0697525949099992), ('wallet', 0.03877641166224935), ('whonix', 0.028542381893042725), ('monero', 0.028057319113386218), ('usb', 0.024439255061112542), ('on', 0.022339449578295967), ('tails electrum', 0.020823069785503738), ('electrum on', 0.020823069785503738)]
Topic 28:
[('coke', 0.19926153013926018), ('coke vendor', 0.03682358431435203), ('best coke', 0.0356582596743699), ('coke vendors', 0.026343335892151952), ('coke from', 0.023791402489163786), ('best', 0.02177867133690634), ('uk coke', 0.02116613307728994), ('coke in', 0.019791918945312438), ('good coke', 0.019662398560559707), ('brazil', 0.01825329276693766)]
Topic 29:
[('alprazolam', 0.07291878421114571), ('diazepam', 0.05477739270191512), ('powder', 0.044584877773492924), ('valium', 0.03850174408224983), ('etizolam', 0.03747570252373598), ('clonazolam', 0.03283601295203318), ('alprazolam powder', 0.02609385034775606), ('10mg', 0.024261819484092423), ('diclazepam', 0.018839623367253085), ('2mg', 0.017417134216026774)]
Topic 30:
[('ketamine', 0.23138118469385505), ('ketamine vendor', 0.03281186428650618), ('ketamine review', 0.025667538351183156), ('isomer', 0.021074224401587748), ('review', 0.01943650911383348), ('domestic ketamine', 0.018773573535844048), ('ketamine vendors', 0.018773573535844048), ('racemic', 0.01767338064151054), ('usa', 0.017429407965742466), ('best ketamine', 0.016880092192053206)]
Topic 31:
[('wsm', 0.18893118468595335), ('on wsm', 0.07838443324529455), ('on', 0.024366535891183275), ('wsm wsm', 0.022356672533994926), ('to wsm', 0.020997391001001165), ('wsm vendor', 0.020361364408962745), ('from wsm', 0.015549309090392706), ('of wsm', 0.014579701648874278), ('wsm is', 0.013247998435407521), ('vendor', 0.01278812957643258)]
Topic 32:
[('refund', 0.075497103967865), ('order', 0.06830056073098584), ('cancel', 0.05861589822091079), ('cancelled', 0.04863445727152318), ('auto', 0.03220391958726608), ('an order', 0.03182625667963673), ('canceled', 0.026137884406081138), ('orders', 0.02526335256672877), ('to cancel', 0.022855288976379076), ('an', 0.0225526675615654)]
Topic 33:
[('mushrooms', 0.10787088587575279), ('shrooms', 0.0621613105343965), ('magic', 0.05571050873232468), ('mushroom', 0.054955686308678885), ('cubensis', 0.04685554721548522), ('magic mushrooms', 0.044651482918634515), ('psychedelics', 0.03945945002111887), ('psilocybin', 0.028563844512759645), ('psilocybe', 0.027421290732249262), ('psilocybe cubensis', 0.024488605716793014)]
Topic 34:
[('order', 0.10292926971740386), ('orders', 0.06400314292162491), ('ordering', 0.028216852218753558), ('ordered', 0.02593596483341188), ('from', 0.025664534449295392), ('order from', 0.025506010837155306), ('an order', 0.021006475473566995), ('ordered from', 0.019380795040906413), ('an', 0.014858006837862698), ('my order', 0.014823145437907932)]
Topic 35:
[('adderall', 0.11427842206017935), ('30mg', 0.03655466173023898), ('pharmacy', 0.03032135856783765), ('ir', 0.028534232288490995), ('ritalin', 0.0273691072254764), ('vyvanse', 0.022935988533964613), ('20mg', 0.021641923329480183), ('brand name', 0.018684967029720622), ('30mg adderall', 0.017426452023171744), ('adderallz', 0.01686291747710415)]
Topic 36:
[('ddos', 0.17414239015277766), ('captcha', 0.07577548089460238), ('ddos attacks', 0.04302444055596632), ('attacks', 0.042004900462510805), ('the ddos', 0.038617577187419594), ('ddos attack', 0.03717492195361204), ('attack', 0.03645706154154813), ('under', 0.02007774942483335), ('the', 0.019890898912752225), ('under ddos', 0.01988571139017261)]
Topic 37:
[('xmr', 0.21138804641985695), ('xmr to', 0.05750289424464451), ('btc', 0.04131037525187534), ('to xmr', 0.03764193405073356), ('morphscript', 0.034505106213172425), ('to', 0.02626411666278744), ('xmr deposit', 0.023019130278705533), ('btc xmr', 0.02071853544066472), ('btc to', 0.020386561961046594), ('xmr xmr', 0.02006179072475952)]
Topic 38:
[('account', 0.08157920319621015), ('login', 0.057614194350978365), ('password', 0.04449622725318416), ('locked', 0.03976770548133053), ('log', 0.03247005948649909), ('locked out', 0.03165830799933661), ('my account', 0.030429683045385705), ('log in', 0.02689883375014806), ('recovery', 0.024509352417925918), ('my', 0.022002421237649776)]
Topic 39:
[('de', 0.04779639905862267), ('på', 0.03372102638336295), ('har', 0.024320398161032687), ('en', 0.024016587519234082), ('noen', 0.02069338999531763), ('er', 0.020648371518196852), ('som', 0.019101590764908583), ('fra', 0.018095669323824955), ('zion', 0.01765646467230412), ('para', 0.016092210850804776)]
Topic 40:
[('heroin', 0.19281429808000025), ('afghan', 0.029139972777356787), ('afghan heroin', 0.02446449007816335), ('heroin vendor', 0.022852789333319346), ('synthetic heroin', 0.022109315479878364), ('crack', 0.019116982789578758), ('cocaine', 0.018328855127266994), ('opium', 0.018041525538213487), ('synthetic', 0.017907448638768184), ('best heroin', 0.017659106201776603)]
Topic 41:
[('paypal', 0.141445389455225), ('bank', 0.09380445473852679), ('drop', 0.06380071112088324), ('bank drop', 0.061129883786000656), ('drops', 0.05659308611795184), ('transfers', 0.049744815859334615), ('bank drops', 0.04874961855138504), ('transfer', 0.042918339848051365), ('paypal transfers', 0.030022841387120944), ('cashout', 0.027298016669983445)]
Topic 42:
[('cc', 0.16431657826411347), ('cvv', 0.09433825676780017), ('vbv', 0.04616551317698169), ('non vbv', 0.028762115432700422), ('cc cvv', 0.027131741330170718), ('non', 0.025808529202315265), ('ccv', 0.025703754944372263), ('ccs', 0.025439312020644134), ('fullz', 0.02007433918779596), ('uk cc', 0.0198102395980812)]
Topic 43:
[('withdraw', 0.12289322331297811), ('withdrawal', 0.0785577127887274), ('withdrawals', 0.03186187266623986), ('to withdraw', 0.030267943437358953), ('withdrawl', 0.027243329733033055), ('pin', 0.02719511037435446), ('withdraw pin', 0.021940158833065233), ('btc', 0.02177432580984283), ('withdraws', 0.02057668981262508), ('working', 0.020044844625358132)]
Topic 44:
[('pills', 0.11126813852504376), ('xtc', 0.06292529793177554), ('xtc pills', 0.0571375463172878), ('pill', 0.05346705854762878), ('ecstasy', 0.0472681743528566), ('pill press', 0.023104528145599042), ('pillchills', 0.021957955442254703), ('ecstasy pills', 0.021700716243544197), ('press', 0.01981535592625848), ('xtc pill', 0.017885222360582386)]
Topic 45:
[('bars', 0.1613690467827236), ('bar', 0.0370835629564199), ('bunk', 0.03659380570861019), ('bunk bars', 0.019897671932218714), ('thebartender', 0.018810298991064574), ('alp', 0.017990425471111064), ('oc bars', 0.017794316603773463), ('oc', 0.017505628508061418), ('bars from', 0.017055147370473183), ('budgetbars', 0.015219071981035545)]
Topic 46:
[('cryptonia', 0.21123299187609967), ('on cryptonia', 0.09273819671628139), ('cryptonia market', 0.036141770391833056), ('on', 0.028423540857280643), ('cryptonia cryptonia', 0.027613651302351248), ('to cryptonia', 0.02293353518301816), ('now', 0.019288496099719173), ('at cryptonia', 0.01582069248026594), ('market', 0.015424512621159242), ('cryptonia is', 0.01392815556061474)]
Topic 47:
[('sale', 0.0669009177130793), ('promo', 0.030808587495418135), ('deals', 0.02013154966324256), ('products', 0.01809389540808409), ('price', 0.017055175637728157), ('20', 0.015416471561826933), ('sales', 0.014123334379440562), ('all', 0.013893851454684978), ('on all', 0.013871032069391573), ('prices', 0.013592431044111595)]
Topic 48:
[('crosspost', 0.16270972230216432), ('review crosspost', 0.025019188870844936), ('crosspost vendor', 0.01838207597271105), ('review', 0.016720105714953134), ('giveaway', 0.013660190788554766), ('crosspost review', 0.011645548578536474), ('2019 crosspost', 0.009958124639188928), ('vendor', 0.009949489051313667), ('giveaway crosspost', 0.009320042620928823), ('for', 0.0077798102009458215)]
Topic 49:
[('pack', 0.1364860666744876), ('packs', 0.11044224137068849), ('oc', 0.0421449573216045), ('landed', 0.039272308885937565), ('pack from', 0.033269023573027565), ('pack landed', 0.028504085141223018), ('packs from', 0.027278093710935654), ('landing', 0.0250916486947475), ('land', 0.024262921839508213), ('from', 0.02161306341104762)]
Topic 50:
[('benzo', 0.11057106932609045), ('benzos', 0.09263476300592406), ('rc', 0.0333035930349936), ('benzobananas', 0.029710130154272065), ('rc benzos', 0.025421318436476232), ('benzos4u', 0.02022653587516694), ('rc benzo', 0.01946620240831312), ('benzo vendor', 0.01912087633670443), ('benzoboys', 0.018962377382969003), ('liquid', 0.01797363735790245)]
Topic 51:
[('links', 0.14985119813634712), ('link', 0.14460198096056903), ('working', 0.11897238135796465), ('pm', 0.08294773649970093), ('working links', 0.07547889923403933), ('working link', 0.06837483843911635), ('pm me', 0.058270123126603415), ('me', 0.04673518044631064), ('please', 0.04145364644774408), ('link please', 0.03342161836140401)]
Topic 52:
[('vendor review', 0.13176904076454998), ('review', 0.10508725381281081), ('vendor', 0.07385331458017709), ('review vendor', 0.04329079272179012), ('reviews', 0.02661195949299305), ('feedback', 0.022680277962396123), ('vendor reviews', 0.01907132995225127), ('review for', 0.014085077763699176), ('review template', 0.012897663376432437), ('template', 0.011980985758793836)]
Topic 53:
[('dnm', 0.12825676892525106), ('dnms', 0.030088111144422336), ('dnstars', 0.024418674148458663), ('dn', 0.024162654979166785), ('the', 0.019843338747650742), ('the dnm', 0.0196448944940482), ('dnmuk', 0.01612662462301172), ('dm', 0.014714059947660639), ('of dnm', 0.013986898892251154), ('on dnm', 0.013003720302421853)]
Topic 54:
[('drugs', 0.1340007380450139), ('drug', 0.07777143413865256), ('drugsuk', 0.021498056085625307), ('drugs in', 0.020534253918424974), ('drugs are', 0.014828844915434722), ('to', 0.013938225834708464), ('the', 0.013475625258109669), ('how', 0.013044053233839281), ('selling drugs', 0.012941161757198665), ('of drugs', 0.012822992341620756)]
Topic 55:
[('speed', 0.183950814430372), ('speed paste', 0.06463035435808222), ('paste', 0.06440461831035135), ('speedbuster', 0.05917315935237283), ('mph', 0.05275738646237156), ('4f', 0.05275738646237156), ('4f mph', 0.05275738646237156), ('vendor speedbuster', 0.027511156012562783), ('vendorshop', 0.025218874791376196), ('speed vendor', 0.02439980096449533)]
Topic 56:
[('dmt', 0.2904789559151491), ('dmt vendor', 0.053730638923248135), ('aco dmt', 0.05260313518578659), ('aco', 0.051665177663252904), ('changa', 0.03326465228271408), ('for dmt', 0.03291369008538988), ('meo dmt', 0.030423677339823103), ('dmt vendors', 0.02913423306345144), ('dmt changa', 0.025253740985173603), ('meo', 0.02496057714410078)]
Topic 57:
[('escrow', 0.20917833515749504), ('in escrow', 0.03831598055804673), ('extend', 0.02307698150961151), ('extend escrow', 0.022350652521144168), ('escrow escrow', 0.01937368965193982), ('on escrow', 0.019133252797808968), ('escrow is', 0.01785189107154408), ('to extend', 0.015178660518416464), ('funds', 0.014105003627973312), ('with escrow', 0.014089956110501688)]
Topic 58:
[('mirror', 0.1999115213913919), ('mirrors', 0.17114555752439659), ('working', 0.09230346542206823), ('working mirror', 0.06698074861901485), ('working mirrors', 0.05348738062554266), ('mirror links', 0.041331468169803175), ('pm', 0.03250728698084981), ('mirror please', 0.02993888348824195), ('empire', 0.029327173164164985), ('pm me', 0.027800481444538137)]
Topic 59:
[('fraud', 0.20527232427652484), ('fraudsters', 0.029587415280798727), ('fraudfox', 0.02306152535746591), ('loan fraud', 0.02208411057803487), ('loan', 0.020765310304769444), ('for fraud', 0.017402408525299756), ('of fraud', 0.015875099058975835), ('best fraud', 0.01432044731610317), ('fraud and', 0.013749148978319373), ('forums', 0.013241012810604615)]
Topic 60:
[('bank', 0.16994005584365096), ('logs', 0.09399747297956386), ('bank logs', 0.07828367201948147), ('bank account', 0.046571339197672396), ('account', 0.03631266913625938), ('accounts', 0.032668200950556496), ('logins', 0.03198527642482958), ('bank logins', 0.028829321452800767), ('fullz', 0.02663355105468197), ('banks', 0.025240976495874503)]
Topic 61:
[('opsec', 0.2736853559343139), ('opsec for', 0.02909894491515325), ('opsec opsec', 0.02828295978078474), ('opsec and', 0.025348928187423684), ('opsec question', 0.02329669639014293), ('opsec guide', 0.022093620273672798), ('good opsec', 0.019959592629263525), ('bad opsec', 0.018855306520523158), ('my opsec', 0.01718392687952329), ('dnm', 0.016706764016850874)]
Topic 62:
[('fentanyl', 0.1955077892521244), ('fent', 0.08198709355740003), ('carfentanil', 0.025983205225545265), ('fentanyl distribution', 0.018954918299687127), ('of fentanyl', 0.018954918299687127), ('selling fentanyl', 0.018954918299687127), ('for fentanyl', 0.018954918299687127), ('distribution', 0.01807846715551803), ('analogues', 0.01713333362248868), ('in', 0.015865359129560205)]
Topic 63:
[('apollon', 0.18726518782162851), ('apollon market', 0.10163168288299376), ('on apollon', 0.0943424516858095), ('market', 0.04339820103949332), ('mysteryland', 0.03511425301193948), ('apollon apollon', 0.03036889048705953), ('on', 0.02815918727792239), ('to apollon', 0.022217092836984975), ('now', 0.022060915825673807), ('now on', 0.01946998401413869)]
Topic 64:
[('cgmc', 0.19823170606174625), ('invite', 0.18193935736305442), ('invite code', 0.08872314369090809), ('code', 0.0741401474069647), ('cgmc invite', 0.062124405870338295), ('on cgmc', 0.039104535817030044), ('an invite', 0.036559756378421876), ('invite codes', 0.03451579499235061), ('invites', 0.0334036319170144), ('codes', 0.03281925512609211)]
Topic 65:
[('aus', 0.0873531594985505), ('australia', 0.06952251222503282), ('aussie', 0.050977518898740276), ('australian', 0.03343382333915655), ('auspost', 0.029562213419453396), ('to australia', 0.02920361228056182), ('to aus', 0.02556850443461526), ('aussies', 0.025002204495531297), ('aussiehits', 0.0223786664446106), ('auspride', 0.02153665064886614)]
Topic 66:
[('phishing', 0.17699950168725506), ('phished', 0.06892906327420083), ('phishing links', 0.052061774436907814), ('links', 0.045282671799454555), ('warning', 0.028705934769355113), ('beware', 0.022619811704386532), ('got phished', 0.022599402854112032), ('link', 0.02249999517006053), ('phishing site', 0.02033946256870083), ('phishing link', 0.020151841887474525)]
Topic 67:
[('wallstreet', 0.18032562887151804), ('on wallstreet', 0.08164141622247753), ('wall', 0.04921975962741589), ('wallstreet market', 0.04187595188848951), ('wall st', 0.04158761120166923), ('st', 0.04115671976566791), ('wallstreetmarket', 0.039058613883569425), ('wallst', 0.03373377136206345), ('on', 0.02829346360257876), ('market', 0.020489164137491248)]
Topic 68:
[('chemicals', 0.10016938894460503), ('research', 0.09281847348498225), ('research chemicals', 0.07118366551106406), ('chems', 0.056909133858895296), ('chemical', 0.05475666577774158), ('research chemical', 0.034301108450939845), ('chem', 0.03224773598075937), ('chemist', 0.024020545800946164), ('com', 0.020917020525176258), ('chemtheory', 0.019376125674780503)]
Topic 69:
[('xtc', 0.06975510013050482), ('xmf', 0.06371143075822816), ('xans', 0.05423739326291133), ('xan', 0.052651185021238125), ('xanmasterfrank', 0.0506170121851223), ('xanos', 0.02402097080505311), ('xansalad', 0.020681665033123523), ('xtc vendor', 0.020240668489923782), ('xan vendor', 0.018096456903983083), ('free xtc', 0.01609315550468023)]
Topic 70:
[('2c', 0.21040678226208337), ('2cb', 0.14155506019153832), ('cb', 0.026888453075878907), ('2cb vendors', 0.024518631692465168), ('for 2c', 0.024518631692465168), ('for 2cb', 0.022197936333833275), ('2c 2c', 0.022197936333833275), ('domestic', 0.021947515946964532), ('us', 0.021687589917266496), ('domestic 2c', 0.021359286959244696)]
Topic 71:
[('review', 0.1293235354899173), ('reviews', 0.08095409212161189), ('review review', 0.04953955668613089), ('feedback', 0.021504094900986072), ('karmaking', 0.013294560556512758), ('reviews on', 0.011950872473099866), ('post', 0.011790311779232664), ('reviewer', 0.011706915341104806), ('sample review', 0.010996486731792397), ('order review', 0.010931383517996723)]
Topic 72:
[('samples', 0.14866103068403283), ('free samples', 0.09900928835552782), ('free', 0.08668026193176706), ('sample', 0.05214012554204901), ('samples on', 0.028360437056176527), ('samples free', 0.02512163566406409), ('free sample', 0.022945404374660198), ('samples left', 0.016630035400317673), ('our products', 0.016630035400317673), ('on all', 0.016404806972083175)]
Topic -1:
[('to', 0.009994879918409097), ('the', 0.008779454354158061), ('is', 0.008650560310022778), ('for', 0.008340702261717663), ('anyone', 0.007211720090693047), ('with', 0.006872813640534324), ('how', 0.006854467093263099), ('and', 0.006784165355486146), ('in', 0.006723924586602931), ('on', 0.006544858676467028)]
umap_embeddings = topic_model.umap_model.fit_transform(tc1.corpus_embeddings)
indices = [index for index, topic in enumerate(new_topics) if topic != -1]
X=umap_embeddings[np.array(indices)]
labels = [topic for index, topic in enumerate(new_topics) if topic != -1]
silhouette_scores = silhouette_score(X, labels)
print(f"silhouette_score: {silhouette_scores}")
print(f"Davies_bouldin_score: {davies_bouldin_score(X, labels)}")
silhouette_score: 0.451997309923172 Davies_bouldin_score: 1.137154174755223
topic_words = topic_model.get_topics()
topics_ll = [topic_words[i] for i in range(len(topic_words) - 1) if i != -1]
topn = 10
topic_list = []
for topic in topics_ll:
topic_list.append([word for word, _ in topic[:topn]])
coherence_model = CoherenceModel(
topics=topic_list,
texts=[doc.split() for doc in tc1.corpus],
dictionary=corpora.Dictionary([doc.split() for doc in tc1.corpus]),
coherence='c_v'
)
print(f"Coherence Model: {coherence_model.get_coherence()}")
Coherence Model: 0.5139428428742826
topic_model.visualize_barchart(top_n_topics=70, custom_labels=True, n_words=10)
sentence = ['sell xanax, coke, weed, gun and password']
tp, pr = topic_model.transform(sentence)
top_indices = np.argsort(pr[0])[::-1][:5]
top_topics = [(topic_model.get_topic(i), pr[0][i], topic_model.generate_topic_labels()[i+1]) for i in top_indices]
df_finals = pd.DataFrame(top_topics, columns=['Topic', 'Probability', 'Label'])
df_finals['Words'] = df_finals['Topic'].apply(lambda topic: [word for word, prob in topic])
df_finals['Sentence'] = sentence * len(df_finals)
df_finals
Batches: 100%|██████████| 1/1 [00:00<00:00, 3.74it/s] 2024-07-01 18:07:22,315 - BERTopic - Dimensionality - Reducing dimensionality of input embeddings. 2024-07-01 18:07:32,083 - BERTopic - Dimensionality - Completed ✓ 2024-07-01 18:07:32,083 - BERTopic - Clustering - Approximating new points with `hdbscan_model` 2024-07-01 18:07:32,099 - BERTopic - Probabilities - Start calculation of probabilities with HDBSCAN 2024-07-01 18:07:32,212 - BERTopic - Probabilities - Completed ✓ 2024-07-01 18:07:32,212 - BERTopic - Cluster - Completed ✓
| Topic | Probability | Label | Words | Sentence | |
|---|---|---|---|---|---|
| 0 | [(alprazolam, 0.06763628572821677), (diazepam,... | 0.013500 | 29_alprazolam_diazepam_powder | [alprazolam, diazepam, powder, valium, etizola... | sell xanax, coke, weed, gun and password |
| 1 | [(xanax, 0.13073115730988635), (bars, 0.032445... | 0.013155 | 8_xanax_bars_xanax bars | [xanax, bars, xanax bars, 2mg, xanax vendor, 3... | sell xanax, coke, weed, gun and password |
| 2 | [(oxycodone, 0.061735118720444515), (oxy, 0.04... | 0.008704 | 21_oxycodone_oxy_opiateconnect | [oxycodone, oxy, opiateconnect, 80mg, opiates,... | sell xanax, coke, weed, gun and password |
| 3 | [(benzo, 0.10894708695513186), (benzos, 0.0934... | 0.007228 | 50_benzo_benzos_rc | [benzo, benzos, rc, rc benzos, benzobananas, b... | sell xanax, coke, weed, gun and password |
| 4 | [(pills, 0.11040973400268793), (xtc, 0.0623399... | 0.007184 | 44_pills_xtc_xtc pills | [pills, xtc, xtc pills, pill, ecstasy, pill pr... | sell xanax, coke, weed, gun and password |
KeyBERT¶
df = pd.read_csv('cleaned_data_name_thread.csv')
df = df.dropna(subset=['name_thread'])
df = df.drop_duplicates(subset=['name_thread'], keep='first')
docs = df['name_thread'].tolist()
kw_model = KeyBERT()
df = pd.read_csv('cleaned_data_name_thread.csv')
df = df.dropna(subset=['name_thread'])
df = df.drop_duplicates(subset=['name_thread'], keep='first')
docs = df['name_thread'].tolist()
keywords = kw_model.extract_keywords(docs)
vocabulary = [k[0] for keyword in keywords for k in keyword]
vocabulary = list(set(vocabulary))
model = SentenceTransformer('all-MiniLM-L6-v2')
embedding = model.encode(vocabulary, batch_size=32, show_progress_bar=True)
Batches: 100%|██████████| 726/726 [01:28<00:00, 8.24it/s]
vectorizer_model= CountVectorizer(analyzer = 'word', ngram_range = (1, 5), vocabulary = vocabulary, lowercase=False)
topic_model_kw = BERTopic(vectorizer_model=vectorizer_model, nr_topics='auto', min_topic_size=120)
topics_kw, probs_kw = topic_model_kw.fit_transform(df.name_thread)
topic_model_kw.visualize_barchart(top_n_topics=200, custom_labels=True, n_words=10)
topic_model_kw.get_topic_info()
| Topic | Count | Name | Representation | Representative_Docs | |
|---|---|---|---|---|---|
| 0 | -1 | 23811 | -1_order_vendor_account_review | [order, vendor, account, review, mg, new, look... | [best market bulk bar alp powder, uk best pric... |
| 1 | 0 | 11480 | 0_weed_mg_xanax_cart | [weed, mg, xanax, cart, lsd, cocaine, pill, ca... | [product review Dimmey Strawberry Banana Sherb... |
| 2 | 1 | 6261 | 1_scammer_scam_vendor_scamming | [scammer, scam, vendor, scamming, market, drea... | [scammer, SOLUTION TO market EXIT scamming, Pa... |
| 3 | 2 | 2273 | 2_guy_help_sub_post | [guy, help, sub, post, message, day, advice, u... | [Sup guy new, please help advice, sub Please R... |
| 4 | 3 | 1760 | 3_empire_working_deposit_market | [empire, working, deposit, market, link, ticke... | [top Notch DM Vendor Mahatma empire, JerryGarc... |
| ... | ... | ... | ... | ... | ... |
| 60 | 59 | 124 | 59_vending_libertas_market_network | [vending, libertas, market, network, start, mo... | [vending Nightmare Market, vending Labels, ven... |
| 61 | 60 | 123 | 60_subdread_subdreads_create_welcome | [subdread, subdreads, create, welcome, require... | [New subdread Developer Project Partnerships, ... |
| 62 | 61 | 123 | 61_seized_seizure_package_raided | [seized, seizure, package, raided, letter, not... | [Need Help Damage Control seized, Mail seized,... |
| 63 | 62 | 122 | 62_tochka_vps_nocturno_transaction | [tochka, vps, nocturno, transaction, installat... | [VENDOR OF LEAN tochka, Waxthtazz tochka, tochka] |
| 64 | 63 | 121 | 63_berlusconi_market_lovedoctor_good | [berlusconi, market, lovedoctor, good, scammer... | [market good berlusconi, cant see berlusconi m... |
65 rows × 5 columns
topic_words = topic_model_kw.get_topics()
topics_ll = [topic_words[i] for i in range(len(topic_words) - 1) if i != -1]
topn = 10
topic_list = []
for topic in topics_ll:
topic_list.append([word for word, _ in topic[:topn]])
coherence_model = CoherenceModel(
topics=topic_list,
texts=[doc.split() for doc in df.name_thread],
dictionary=corpora.Dictionary([doc.split() for doc in df.name_thread]),
coherence='c_v'
)
print(f"Coherence Model: {coherence_model.get_coherence()}")
Coherence Model: 0.36753855014096726
topic_model_kw.visualize_topics()